diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_BBS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_BBS_BH.yaml new file mode 100644 index 00000000000..921d8d9b408 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_BBS_BH.yaml @@ -0,0 +1,16503 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS0_GRVW8_PLR1_SIA3_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 30.7249] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [26, 51.6133] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [0, 87.5345] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [39, 141.834] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [39, 202.673] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [39, 260.064] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [42, 292.868] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [2, 54.3585] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [27, 99.4288] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [39, 172.975] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [39, 277.622] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [51, 400.909] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [39, 519.435] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [41, 584.949] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [26, 126.107] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [52, 236.646] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [39, 407.61] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [39, 633.485] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [39, 874.407] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [45, 1087.56] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [48, 1204.61] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [9, 272.145] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [37, 495.722] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [45, 844.776] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [39, 1307.76] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [53, 1801.19] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [42, 2232.65] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [41, 2441.21] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [39, 567.718] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [43, 1078.37] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [52, 1809.06] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [40, 2774.47] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [39, 3752.04] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [43, 4494.46] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [48, 4962.21] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [43, 1198.37] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [39, 2173.5] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [37, 3615.39] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [52, 5323.99] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [52, 7465.67] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [44, 9106.61] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [40, 9173.37] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [5, 2062.6] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [22, 3754.97] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [24, 6284.81] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [7, 9496.76] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [5, 13129.0] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [23, 15629.4] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [24, 16788.0] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [2, 62.2892] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [57, 99.7888] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [45, 176.202] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [39, 278.008] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [45, 402.467] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [51, 519.017] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [41, 582.644] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [28, 135.51] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [34, 265.16] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [52, 439.794] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [15, 650.331] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [52, 905.116] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [45, 1111.77] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [39, 1214.05] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [28, 333.305] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [45, 589.502] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [45, 979.297] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [45, 1456.86] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [39, 1914.34] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [52, 2307.1] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [55, 2480.05] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [25, 678.032] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [45, 1202.84] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [45, 1992.32] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [43, 3033.85] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [39, 4018.98] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [46, 4681.47] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [42, 5033.67] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [45, 1386.56] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [52, 2617.35] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [52, 4254.4] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [45, 5978.52] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [42, 8119.64] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [45, 9423.09] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [42, 10243.1] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [6, 2722.25] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [45, 4679.18] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [48, 7688.04] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [42, 11409.2] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [39, 15393.7] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [46, 18850.2] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [1, 16319.5] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [24, 4496.71] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [15, 8042.79] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [8, 13359.0] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [23, 20091.0] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [24, 27069.6] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [8, 32463.8] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [24, 32971.3] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [2, 139.346] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [39, 229.147] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [39, 394.053] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [52, 630.344] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [38, 875.319] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [39, 1085.69] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [42, 1199.89] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [33, 314.274] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [28, 592.249] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [52, 978.492] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [39, 1416.64] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [52, 1946.54] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [52, 2302.19] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [55, 2497.22] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [9, 679.022] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [39, 1206.13] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [45, 1972.63] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [50, 2934.11] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [48, 4008.9] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [48, 4689.49] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [42, 5033.58] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [52, 1464.24] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [45, 2606.37] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [48, 4221.21] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [55, 5979.05] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [48, 7924.53] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [39, 9532.18] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [42, 10030.3] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [13, 2770.35] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [45, 4902.04] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [13, 7787.98] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [46, 11479.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [42, 15410.5] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [55, 18775.0] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [55, 20118.8] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [33, 4644.87] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [6, 8300.44] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [24, 13617.9] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [15, 20187.7] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [15, 26389.7] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [7, 31808.0] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [8, 34198.9] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [26, 8185.01] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [8, 14367.1] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [7, 21442.2] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [13, 27799.9] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [6, 33428.1] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [6, 36071.0] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [13, 38653.0] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [2, 215.196] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [45, 358.733] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [45, 610.88] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [49, 945.16] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [39, 1351.04] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [41, 1636.59] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [48, 1809.91] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [18, 486.881] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [52, 876.858] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [39, 1448.81] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [52, 2157.38] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [52, 2900.63] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [42, 3499.39] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [55, 3726.55] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [56, 1086.42] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [39, 1929.01] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [48, 3010.28] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [39, 4460.44] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [39, 5901.59] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [55, 7120.05] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [48, 7472.88] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [55, 2105.93] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [37, 3720.55] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [39, 6111.18] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [55, 8732.11] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [48, 11650.8] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [54, 13892.7] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [46, 14912.5] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [15, 3460.19] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [29, 6184.79] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [24, 10509.8] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [24, 15533.3] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [15, 20049.5] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [22, 23903.4] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [15, 25873.3] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [33, 5974.08] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [24, 11479.4] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [15, 17897.3] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [8, 24819.9] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [6, 30470.6] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [23, 33605.6] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [6, 36697.9] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [33, 10317.0] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [19, 16366.7] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [52, 23518.1] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [55, 29611.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [55, 32562.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [48, 35377.9] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [42, 36190.9] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [18, 469.512] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [37, 787.022] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [24, 1303.66] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [39, 1976.27] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [48, 2740.63] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [48, 3280.97] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [47, 3603.29] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [9, 970.454] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [43, 1825.46] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [52, 2986.33] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [45, 4419.72] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [48, 5939.9] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [48, 6919.87] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [48, 7420.68] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [29, 2114.06] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [8, 3705.22] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [45, 5881.93] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [45, 8712.44] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [39, 11451.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [42, 13812.2] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [55, 14830.2] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [15, 3621.5] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [13, 6408.43] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [8, 10534.0] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [42, 15170.4] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [24, 20003.6] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [24, 24272.4] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [8, 25795.0] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [29, 6152.28] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [16, 10627.5] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [24, 17379.8] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [22, 24322.1] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [22, 30392.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [13, 33606.3] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [22, 36829.2] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [36, 10766.1] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [19, 17115.3] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [10, 24102.3] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [55, 29612.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [55, 33057.9] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [55, 36158.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [24, 38116.5] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [10, 13569.2] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [9, 20596.1] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [20, 28041.9] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [55, 31764.4] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [55, 35959.2] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [5, 38610.0] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [12, 39510.0] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [17, 946.233] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [45, 1601.29] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [24, 2632.96] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [52, 3934.62] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [48, 5351.02] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [48, 6619.77] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [48, 7303.71] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [15, 1888.76] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [52, 3539.99] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [39, 6041.49] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [52, 8806.94] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [52, 11234.8] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [48, 13761.7] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [46, 14106.4] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [29, 3585.9] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [15, 6423.15] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [8, 10526.4] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [24, 15462.9] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [24, 20284.8] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [22, 24015.4] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [15, 26034.8] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [31, 6139.5] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [24, 11486.0] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [24, 17816.5] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [24, 24818.4] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [15, 30258.8] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [13, 33568.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [6, 36809.0] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [33, 10476.0] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [19, 17247.2] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [24, 24158.7] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [48, 29792.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [55, 32800.7] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 36051.4] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 37710.7] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [29, 13608.7] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [35, 20714.8] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [4, 27674.7] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [24, 31745.0] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [55, 36001.8] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [23, 38558.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [12, 39599.1] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [29, 16167.6] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [55, 23589.1] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [3, 29528.2] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [55, 34534.8] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [55, 37987.8] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [55, 39263.6] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [21, 39568.3] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [12, 1677.05] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [6, 2923.55] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [23, 4878.52] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [14, 7309.82] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [7, 9832.81] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [13, 11891.0] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [5, 12772.5] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [24, 3723.32] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [22, 6671.77] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [24, 10899.0] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [15, 15927.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 20958.4] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 24666.3] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 26183.3] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [32, 7056.17] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [15, 11948.2] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [15, 18416.3] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [24, 25338.5] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [15, 31568.3] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [22, 33824.4] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [6, 37041.4] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [34, 10352.1] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [9, 16704.9] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [15, 23610.5] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [20, 30053.3] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [6, 32774.0] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [15, 36148.1] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 37999.9] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [19, 13599.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [18, 20720.1] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [19, 27637.7] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [55, 32050.1] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [55, 36124.6] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [23, 38669.3] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [12, 39553.2] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [35, 16164.3] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [11, 23552.5] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [19, 29435.9] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [55, 34564.0] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [55, 37989.5] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [55, 39266.9] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [12, 39794.0] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [30, 17077.5] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [20, 24680.7] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [20, 31525.4] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [55, 36130.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [55, 38439.8] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [55, 38969.9] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 39162.7] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [7, 1825.59] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [12, 3877.78] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [7, 6455.27] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [12, 9703.46] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [5, 13158.6] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 15595.8] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [5, 16933.6] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [26, 5161.43] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [32, 9529.84] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [8, 15379.7] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [8, 22033.7] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [15, 27741.0] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [24, 32030.1] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 33942.7] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [29, 8763.26] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [15, 14527.3] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [15, 21554.2] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [24, 27741.0] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [13, 33455.2] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [13, 36019.6] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [13, 38550.0] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [30, 11749.8] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [30, 18454.3] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [20, 25520.3] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [24, 31776.9] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [24, 34586.7] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [6, 37775.6] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [13, 39494.7] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [33, 14779.3] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [19, 22062.7] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [19, 28909.1] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [55, 33080.5] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [55, 36944.1] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [55, 39054.4] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 39165.9] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [33, 16976.7] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [20, 23385.0] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [11, 30381.9] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [55, 35334.5] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [55, 38517.3] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [48, 39106.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 40079.5] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [33, 17567.5] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [20, 25017.0] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [19, 31303.8] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [55, 36382.3] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [55, 38332.7] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [21, 39483.2] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [24, 39187.2] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml new file mode 100644 index 00000000000..ce2e15aa4a7 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_BBS_BH_GB.yaml @@ -0,0 +1,16503 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW4_PLR1_SIA2_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS1_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_EPS0_GRVW8_PLR1_SIA3_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_EPS1_GRVW4_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_AMAS3_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 30.7249] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [26, 51.6133] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [0, 87.5345] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [39, 141.834] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [39, 202.673] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [39, 260.064] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [42, 292.868] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [2, 54.3585] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [27, 99.4288] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [39, 172.975] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [39, 277.622] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [51, 400.909] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [39, 519.435] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [41, 584.949] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [26, 126.107] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [52, 236.646] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [39, 407.61] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [39, 633.485] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [39, 874.407] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [45, 1087.56] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [48, 1204.61] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [9, 272.145] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [37, 495.722] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [45, 844.776] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [39, 1307.76] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [53, 1801.19] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [42, 2232.65] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [41, 2441.21] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [39, 567.718] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [43, 1078.37] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [52, 1809.06] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [40, 2774.47] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [39, 3752.04] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [43, 4494.46] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [48, 4962.21] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [43, 1198.37] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [39, 2173.5] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [37, 3615.39] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [52, 5323.99] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [52, 7465.67] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [44, 9106.61] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [40, 9173.37] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [5, 2062.6] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [22, 3754.97] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [24, 6284.81] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [7, 9496.76] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [5, 13129.0] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [23, 15629.4] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [24, 16788.0] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [2, 62.2892] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [57, 99.7888] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [45, 176.202] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [39, 278.008] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [45, 402.467] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [51, 519.017] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [41, 582.644] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [28, 135.51] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [34, 265.16] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [52, 439.794] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [15, 650.331] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [52, 905.116] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [45, 1111.77] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [39, 1214.05] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [28, 333.305] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [45, 589.502] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [45, 979.297] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [45, 1456.86] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [39, 1914.34] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [52, 2307.1] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [55, 2480.05] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [25, 678.032] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [45, 1202.84] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [45, 1992.32] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [43, 3033.85] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [39, 4018.98] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [46, 4681.47] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [42, 5033.67] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [45, 1386.56] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [52, 2617.35] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [52, 4254.4] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [45, 5978.52] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [42, 8119.64] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [45, 9423.09] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [42, 10243.1] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [6, 2722.25] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [45, 4679.18] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [48, 7688.04] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [42, 11409.2] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [39, 15393.7] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [46, 18850.2] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [1, 16319.5] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [24, 4496.71] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [15, 8042.79] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [8, 13359.0] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [23, 20091.0] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [24, 27069.6] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [8, 32463.8] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [24, 32971.3] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [2, 139.346] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [39, 229.147] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [39, 394.053] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [52, 630.344] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [38, 875.319] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [39, 1085.69] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [42, 1199.89] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [33, 314.274] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [28, 592.249] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [52, 978.492] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [39, 1416.64] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [52, 1946.54] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [52, 2302.19] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [55, 2497.22] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [9, 679.022] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [39, 1206.13] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [45, 1972.63] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [50, 2934.11] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [48, 4008.9] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [48, 4689.49] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [42, 5033.58] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [52, 1464.24] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [45, 2606.37] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [48, 4221.21] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [55, 5979.05] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [48, 7924.53] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [39, 9532.18] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [42, 10030.3] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [13, 2770.35] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [45, 4902.04] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [13, 7787.98] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [46, 11479.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [42, 15410.5] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [55, 18775.0] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [55, 20118.8] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [33, 4644.87] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [6, 8300.44] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [24, 13617.9] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [15, 20187.7] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [15, 26389.7] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [7, 31808.0] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [8, 34198.9] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [26, 8185.01] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [8, 14367.1] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [7, 21442.2] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [13, 27799.9] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [6, 33428.1] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [6, 36071.0] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [13, 38653.0] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [2, 215.196] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [45, 358.733] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [45, 610.88] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [49, 945.16] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [39, 1351.04] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [41, 1636.59] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [48, 1809.91] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [18, 486.881] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [52, 876.858] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [39, 1448.81] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [52, 2157.38] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [52, 2900.63] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [42, 3499.39] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [55, 3726.55] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [56, 1086.42] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [39, 1929.01] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [48, 3010.28] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [39, 4460.44] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [39, 5901.59] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [55, 7120.05] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [48, 7472.88] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [55, 2105.93] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [37, 3720.55] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [39, 6111.18] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [55, 8732.11] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [48, 11650.8] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [54, 13892.7] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [46, 14912.5] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [15, 3460.19] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [29, 6184.79] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [24, 10509.8] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [24, 15533.3] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [15, 20049.5] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [22, 23903.4] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [15, 25873.3] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [33, 5974.08] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [24, 11479.4] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [15, 17897.3] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [8, 24819.9] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [6, 30470.6] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [23, 33605.6] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [6, 36697.9] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [33, 10317.0] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [19, 16366.7] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [52, 23518.1] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [55, 29611.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [55, 32562.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [48, 35377.9] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [42, 36190.9] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [18, 469.512] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [37, 787.022] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [24, 1303.66] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [39, 1976.27] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [48, 2740.63] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [48, 3280.97] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [47, 3603.29] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [9, 970.454] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [43, 1825.46] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [52, 2986.33] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [45, 4419.72] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [48, 5939.9] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [48, 6919.87] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [48, 7420.68] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [29, 2114.06] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [8, 3705.22] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [45, 5881.93] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [45, 8712.44] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [39, 11451.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [42, 13812.2] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [55, 14830.2] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [15, 3621.5] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [13, 6408.43] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [8, 10534.0] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [42, 15170.4] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [24, 20003.6] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [24, 24272.4] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [8, 25795.0] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [29, 6152.28] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [16, 10627.5] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [24, 17379.8] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [22, 24322.1] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [22, 30392.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [13, 33606.3] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [22, 36829.2] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [36, 10766.1] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [19, 17115.3] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [10, 24102.3] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [55, 29612.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [55, 33057.9] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [55, 36158.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [24, 38116.5] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [10, 13569.2] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [9, 20596.1] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [20, 28041.9] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [55, 31764.4] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [55, 35959.2] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [5, 38610.0] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [12, 39510.0] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [17, 946.233] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [45, 1601.29] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [24, 2632.96] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [52, 3934.62] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [48, 5351.02] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [48, 6619.77] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [48, 7303.71] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [15, 1888.76] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [52, 3539.99] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [39, 6041.49] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [52, 8806.94] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [52, 11234.8] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [48, 13761.7] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [46, 14106.4] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [29, 3585.9] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [15, 6423.15] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [8, 10526.4] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [24, 15462.9] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [24, 20284.8] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [22, 24015.4] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [15, 26034.8] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [31, 6139.5] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [24, 11486.0] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [24, 17816.5] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [24, 24818.4] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [15, 30258.8] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [13, 33568.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [6, 36809.0] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [33, 10476.0] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [19, 17247.2] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [24, 24158.7] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [48, 29792.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [55, 32800.7] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 36051.4] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 37710.7] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [29, 13608.7] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [35, 20714.8] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [4, 27674.7] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [24, 31745.0] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [55, 36001.8] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [23, 38558.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [12, 39599.1] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [29, 16167.6] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [55, 23589.1] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [3, 29528.2] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [55, 34534.8] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [55, 37987.8] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [55, 39263.6] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [21, 39568.3] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [12, 1677.05] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [6, 2923.55] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [23, 4878.52] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [14, 7309.82] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [7, 9832.81] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [13, 11891.0] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [5, 12772.5] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [24, 3723.32] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [22, 6671.77] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [24, 10899.0] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [15, 15927.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 20958.4] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 24666.3] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 26183.3] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [32, 7056.17] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [15, 11948.2] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [15, 18416.3] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [24, 25338.5] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [15, 31568.3] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [22, 33824.4] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [6, 37041.4] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [34, 10352.1] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [9, 16704.9] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [15, 23610.5] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [20, 30053.3] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [6, 32774.0] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [15, 36148.1] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 37999.9] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [19, 13599.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [18, 20720.1] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [19, 27637.7] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [55, 32050.1] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [55, 36124.6] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [23, 38669.3] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [12, 39553.2] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [35, 16164.3] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [11, 23552.5] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [19, 29435.9] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [55, 34564.0] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [55, 37989.5] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [55, 39266.9] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [12, 39794.0] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [30, 17077.5] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [20, 24680.7] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [20, 31525.4] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [55, 36130.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [55, 38439.8] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [55, 38969.9] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 39162.7] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [7, 1825.59] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [12, 3877.78] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [7, 6455.27] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [12, 9703.46] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [5, 13158.6] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 15595.8] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [5, 16933.6] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [26, 5161.43] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [32, 9529.84] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [8, 15379.7] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [8, 22033.7] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [15, 27741.0] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [24, 32030.1] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 33942.7] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [29, 8763.26] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [15, 14527.3] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [15, 21554.2] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [24, 27741.0] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [13, 33455.2] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [13, 36019.6] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [13, 38550.0] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [30, 11749.8] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [30, 18454.3] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [20, 25520.3] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [24, 31776.9] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [24, 34586.7] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [6, 37775.6] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [13, 39494.7] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [33, 14779.3] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [19, 22062.7] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [19, 28909.1] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [55, 33080.5] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [55, 36944.1] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [55, 39054.4] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 39165.9] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [33, 16976.7] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [20, 23385.0] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [11, 30381.9] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [55, 35334.5] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [55, 38517.3] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [48, 39106.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 40079.5] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [33, 17567.5] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [20, 25017.0] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [19, 31303.8] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [55, 36382.3] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [55, 38332.7] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [21, 39483.2] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [24, 39187.2] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HB.yaml new file mode 100644 index 00000000000..7d392215115 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HB.yaml @@ -0,0 +1,17853 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [3, 34.9153] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [11, 58.2736] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [33, 98.8013] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [45, 152.299] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [46, 214.598] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [46, 271.239] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [45, 301.196] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [9, 61.71] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [29, 112.412] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [11, 191.381] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [46, 298.633] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [51, 426.641] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [61, 540.086] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [46, 602.122] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [29, 149.2] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [32, 268.212] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [46, 448.829] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [46, 665.763] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [45, 922.18] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [56, 1138.1] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [46, 1240.21] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [23, 329.223] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [46, 581.573] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [45, 963.322] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [52, 1406.19] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [46, 1925.32] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [61, 2332.03] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [52, 2525.07] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [46, 644.385] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [45, 1150.23] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [59, 1932.87] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [60, 2905.15] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [59, 3908.95] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [59, 4710.06] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [51, 5090.37] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [16, 1293.74] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [55, 2312.82] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [53, 3847.1] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [57, 5772.31] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [60, 7770.84] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [55, 9587.67] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [0, 8013.37] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [24, 2399.14] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [18, 4148.16] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [40, 6804.79] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [18, 10149.6] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [5, 13540.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [32, 16308.4] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [10, 17122.4] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [9, 70.167] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [15, 121.857] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [8, 197.882] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [8, 303.913] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [55, 424.934] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [51, 539.305] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [55, 604.454] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [27, 168.392] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [1, 289.183] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [19, 474.576] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [52, 700.57] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [52, 957.057] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [48, 1167.11] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [62, 1248.14] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [62, 366.635] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [8, 653.726] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [59, 1055.57] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [52, 1546.0] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [46, 2044.51] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [45, 2397.95] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [59, 2538.78] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [62, 776.867] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [56, 1360.69] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [46, 2213.94] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [49, 3190.8] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [52, 4095.25] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [61, 4822.78] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [56, 5149.25] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [33, 1599.96] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [46, 2781.37] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [11, 4393.67] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [44, 6307.83] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [59, 8252.44] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [58, 9670.92] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [59, 10377.1] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [59, 2794.81] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [52, 4949.03] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [59, 8096.18] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [50, 11933.7] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [47, 15958.4] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [58, 19716.9] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [47, 18578.1] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [25, 5083.23] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [19, 8912.2] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [8, 14500.7] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [30, 21445.7] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [11, 28067.3] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [24, 32858.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [33, 33553.4] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [29, 157.728] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [15, 274.892] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [46, 431.027] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [49, 661.562] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [46, 920.914] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [61, 1146.42] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [46, 1233.98] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [19, 365.485] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [11, 653.522] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [59, 1058.5] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [46, 1543.3] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [46, 2041.28] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [59, 2387.62] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [59, 2536.38] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [33, 787.663] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [4, 1365.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [52, 2210.15] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [59, 3105.76] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [52, 4094.0] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [56, 4833.37] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [52, 5127.12] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [11, 1600.88] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [59, 2655.04] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [46, 4520.35] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [59, 6483.34] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [59, 8388.61] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [54, 9743.58] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [59, 10426.1] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [46, 2973.1] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [59, 5197.4] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [46, 8447.77] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [44, 12321.5] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [59, 16261.9] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [49, 19341.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [46, 15925.5] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [31, 5006.65] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [31, 8861.6] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [19, 14450.7] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [19, 21171.7] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [11, 27292.5] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [8, 32448.2] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [19, 34710.7] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [46, 8869.79] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [59, 14643.0] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [25, 23511.9] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [25, 30297.5] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [7, 35413.7] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [7, 37152.4] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [11, 39461.3] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [27, 244.006] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [8, 399.865] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [59, 690.762] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [46, 1051.91] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [59, 1401.06] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [59, 1707.84] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [60, 1858.73] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [8, 557.756] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [11, 999.278] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [59, 1553.64] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [46, 2289.47] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [59, 3020.93] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [52, 3573.55] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [50, 3783.91] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [33, 1127.91] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [56, 1963.63] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [46, 3210.75] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [46, 4681.59] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [46, 6143.62] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [50, 7282.05] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [44, 7603.12] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [8, 2291.13] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [59, 3987.63] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [49, 6472.69] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [52, 9422.76] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [59, 12321.1] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [49, 14284.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [62, 15235.6] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [31, 3940.16] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [33, 6951.91] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [11, 11229.7] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [8, 15833.8] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [41, 20747.9] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [40, 24599.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [11, 26273.9] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [59, 6458.57] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [36, 11091.2] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [59, 18470.3] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [31, 25392.8] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [41, 31675.0] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [8, 34049.6] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [41, 37367.5] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [42, 12273.1] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [59, 19450.0] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [59, 26313.8] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [52, 31963.0] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [46, 33835.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [46, 36773.3] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [48, 36908.1] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [2, 535.717] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [12, 918.863] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [45, 1455.34] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [55, 2118.34] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [45, 2845.37] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [58, 3419.04] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [51, 3699.5] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [33, 1096.65] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [33, 1963.63] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [59, 3191.2] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [46, 4675.07] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [46, 6065.16] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [59, 7186.65] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [59, 7484.55] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [6, 2168.72] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [49, 3992.04] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [59, 6497.76] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [59, 9469.76] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [45, 12084.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [51, 14038.5] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [62, 15070.8] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [33, 3800.34] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [31, 6970.18] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [33, 11278.8] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [8, 16259.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [19, 20658.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [11, 24612.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [41, 26264.9] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [36, 6458.57] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [21, 11061.9] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [59, 18391.0] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [41, 25383.2] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [41, 31530.0] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [11, 34229.0] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [11, 37142.6] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [29, 12690.8] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [27, 19917.5] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [29, 26832.9] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [15, 32077.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [59, 34264.7] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [59, 37386.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [52, 37998.9] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [27, 18155.5] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [29, 25552.3] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [59, 31741.2] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [59, 34249.4] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [41, 37706.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [11, 39802.2] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [6, 40179.8] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [14, 1081.01] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [12, 1851.79] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [45, 2932.4] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [61, 4234.18] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [51, 5783.58] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [62, 6876.85] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [53, 6858.93] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [19, 2272.1] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [33, 3979.42] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [59, 6451.14] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [44, 9027.29] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [45, 11830.9] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [46, 14157.5] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [33, 13835.5] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [31, 3762.28] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [62, 6763.21] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [19, 11234.7] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [33, 16220.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [18, 21014.2] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 24846.7] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [8, 26163.3] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [26, 6484.37] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [59, 11439.0] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [56, 18857.9] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [41, 26962.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [19, 31979.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 34351.7] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [11, 37319.0] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [27, 12532.8] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [62, 20006.6] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [28, 27122.0] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [27, 32482.5] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [62, 34321.7] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [59, 37290.1] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [50, 37953.3] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [62, 17898.9] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [29, 25610.8] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [27, 31815.2] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [59, 34524.0] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [8, 37561.0] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [11, 39738.6] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [41, 40287.8] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [34, 22070.4] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [37, 29375.8] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [59, 33306.1] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [59, 37297.4] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [62, 39605.9] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [59, 40255.3] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [52, 40215.5] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [7, 1872.46] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [32, 3337.65] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [16, 5451.87] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [7, 7943.76] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [30, 10278.1] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [30, 12300.4] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [30, 13004.6] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [1, 4681.16] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [33, 7596.14] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [11, 12038.3] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [8, 17121.1] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [7, 21645.7] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [18, 24801.6] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [25, 26489.9] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [56, 7951.29] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [31, 13061.3] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [31, 19438.7] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [11, 26281.2] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [41, 31973.1] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [11, 34560.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [41, 37295.3] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [15, 12526.5] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [36, 19512.2] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [27, 26250.3] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [62, 31807.7] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [52, 34137.6] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [56, 37148.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 37135.7] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [35, 18067.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [36, 25568.6] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [27, 31411.9] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [59, 34285.2] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [41, 37824.7] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [25, 39731.2] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [41, 40255.9] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [20, 21989.7] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [59, 29567.8] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [15, 33288.8] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [59, 37445.3] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [62, 39627.4] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [59, 40289.4] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [38, 40195.0] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [43, 25429.7] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [29, 31016.9] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [59, 35987.8] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [59, 39042.8] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [62, 40059.8] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [51, 40458.9] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [17, 40375.6] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [24, 2510.81] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [22, 4561.51] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [30, 7388.4] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [30, 10738.3] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [16, 13631.0] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 16088.9] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 17243.6] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [21, 5612.07] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [31, 9841.45] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [8, 15707.2] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [19, 22468.9] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 28579.8] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 33432.2] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [39, 34105.0] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [56, 10348.3] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [29, 16414.1] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [41, 23547.0] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [8, 30174.9] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [40, 34884.4] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [41, 37141.5] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [7, 39111.0] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [59, 14532.0] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [29, 21793.9] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [27, 28396.9] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [8, 33692.4] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [25, 36478.4] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [41, 39067.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [41, 40319.3] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [35, 19888.6] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [27, 27598.4] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [59, 33345.0] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [62, 35574.4] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [59, 38600.6] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [11, 40177.1] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [41, 40497.8] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [35, 23611.2] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [13, 30611.0] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [59, 34723.1] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [59, 38192.1] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [62, 40215.5] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [52, 40698.2] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [16, 40568.9] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [27, 24612.5] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [59, 30943.6] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [59, 34765.0] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [59, 39125.2] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [59, 40453.0] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [59, 40600.8] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [31, 40308.5] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HB_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HB_GB.yaml new file mode 100644 index 00000000000..f70a671778c --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HB_GB.yaml @@ -0,0 +1,17853 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [3, 34.9153] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [11, 58.2736] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [33, 98.8013] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [45, 152.299] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [46, 214.598] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [46, 271.239] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [45, 301.196] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [9, 61.71] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [29, 112.412] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [11, 191.381] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [46, 298.633] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [51, 426.641] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [61, 540.086] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [46, 602.122] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [29, 149.2] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [32, 268.212] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [46, 448.829] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [46, 665.763] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [45, 922.18] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [56, 1138.1] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [46, 1240.21] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [23, 329.223] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [46, 581.573] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [45, 963.322] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [52, 1406.19] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [46, 1925.32] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [61, 2332.03] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [52, 2525.07] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [46, 644.385] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [45, 1150.23] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [59, 1932.87] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [60, 2905.15] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [59, 3908.95] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [59, 4710.06] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [51, 5090.37] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [16, 1293.74] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [55, 2312.82] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [53, 3847.1] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [57, 5772.31] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [60, 7770.84] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [55, 9587.67] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [0, 8013.37] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [24, 2399.14] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [18, 4148.16] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [40, 6804.79] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [18, 10149.6] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [5, 13540.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [32, 16308.4] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [10, 17122.4] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [9, 70.167] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [15, 121.857] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [8, 197.882] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [8, 303.913] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [55, 424.934] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [51, 539.305] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [55, 604.454] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [27, 168.392] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [1, 289.183] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [19, 474.576] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [52, 700.57] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [52, 957.057] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [48, 1167.11] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [62, 1248.14] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [62, 366.635] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [8, 653.726] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [59, 1055.57] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [52, 1546.0] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [46, 2044.51] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [45, 2397.95] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [59, 2538.78] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [62, 776.867] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [56, 1360.69] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [46, 2213.94] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [49, 3190.8] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [52, 4095.25] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [61, 4822.78] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [56, 5149.25] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [33, 1599.96] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [46, 2781.37] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [11, 4393.67] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [44, 6307.83] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [59, 8252.44] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [58, 9670.92] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [59, 10377.1] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [59, 2794.81] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [52, 4949.03] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [59, 8096.18] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [50, 11933.7] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [47, 15958.4] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [58, 19716.9] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [47, 18578.1] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [25, 5083.23] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [19, 8912.2] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [8, 14500.7] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [30, 21445.7] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [11, 28067.3] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [24, 32858.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [33, 33553.4] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [29, 157.728] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [15, 274.892] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [46, 431.027] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [49, 661.562] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [46, 920.914] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [61, 1146.42] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [46, 1233.98] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [19, 365.485] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [11, 653.522] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [59, 1058.5] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [46, 1543.3] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [46, 2041.28] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [59, 2387.62] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [59, 2536.38] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [33, 787.663] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [4, 1365.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [52, 2210.15] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [59, 3105.76] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [52, 4094.0] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [56, 4833.37] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [52, 5127.12] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [11, 1600.88] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [59, 2655.04] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [46, 4520.35] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [59, 6483.34] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [59, 8388.61] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [54, 9743.58] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [59, 10426.1] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [46, 2973.1] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [59, 5197.4] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [46, 8447.77] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [44, 12321.5] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [59, 16261.9] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [49, 19341.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [46, 15925.5] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [31, 5006.65] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [31, 8861.6] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [19, 14450.7] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [19, 21171.7] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [11, 27292.5] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [8, 32448.2] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [19, 34710.7] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [46, 8869.79] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [59, 14643.0] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [25, 23511.9] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [25, 30297.5] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [7, 35413.7] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [7, 37152.4] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [11, 39461.3] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [27, 244.006] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [8, 399.865] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [59, 690.762] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [46, 1051.91] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [59, 1401.06] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [59, 1707.84] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [60, 1858.73] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [8, 557.756] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [11, 999.278] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [59, 1553.64] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [46, 2289.47] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [59, 3020.93] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [52, 3573.55] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [50, 3783.91] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [33, 1127.91] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [56, 1963.63] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [46, 3210.75] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [46, 4681.59] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [46, 6143.62] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [50, 7282.05] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [44, 7603.12] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [8, 2291.13] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [59, 3987.63] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [49, 6472.69] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [52, 9422.76] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [59, 12321.1] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [49, 14284.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [62, 15235.6] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [31, 3940.16] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [33, 6951.91] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [11, 11229.7] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [8, 15833.8] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [41, 20747.9] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [40, 24599.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [11, 26273.9] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [59, 6458.57] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [36, 11091.2] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [59, 18470.3] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [31, 25392.8] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [41, 31675.0] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [8, 34049.6] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [41, 37367.5] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [42, 12273.1] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [59, 19450.0] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [59, 26313.8] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [52, 31963.0] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [46, 33835.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [46, 36773.3] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [48, 36908.1] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [2, 535.717] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [12, 918.863] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [45, 1455.34] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [55, 2118.34] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [45, 2845.37] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [58, 3419.04] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [51, 3699.5] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [33, 1096.65] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [33, 1963.63] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [59, 3191.2] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [46, 4675.07] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [46, 6065.16] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [59, 7186.65] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [59, 7484.55] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [6, 2168.72] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [49, 3992.04] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [59, 6497.76] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [59, 9469.76] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [45, 12084.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [51, 14038.5] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [62, 15070.8] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [33, 3800.34] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [31, 6970.18] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [33, 11278.8] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [8, 16259.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [19, 20658.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [11, 24612.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [41, 26264.9] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [36, 6458.57] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [21, 11061.9] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [59, 18391.0] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [41, 25383.2] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [41, 31530.0] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [11, 34229.0] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [11, 37142.6] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [29, 12690.8] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [27, 19917.5] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [29, 26832.9] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [15, 32077.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [59, 34264.7] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [59, 37386.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [52, 37998.9] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [27, 18155.5] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [29, 25552.3] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [59, 31741.2] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [59, 34249.4] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [41, 37706.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [11, 39802.2] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [6, 40179.8] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [14, 1081.01] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [12, 1851.79] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [45, 2932.4] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [61, 4234.18] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [51, 5783.58] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [62, 6876.85] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [53, 6858.93] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [19, 2272.1] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [33, 3979.42] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [59, 6451.14] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [44, 9027.29] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [45, 11830.9] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [46, 14157.5] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [33, 13835.5] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [31, 3762.28] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [62, 6763.21] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [19, 11234.7] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [33, 16220.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [18, 21014.2] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 24846.7] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [8, 26163.3] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [26, 6484.37] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [59, 11439.0] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [56, 18857.9] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [41, 26962.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [19, 31979.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 34351.7] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [11, 37319.0] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [27, 12532.8] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [62, 20006.6] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [28, 27122.0] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [27, 32482.5] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [62, 34321.7] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [59, 37290.1] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [50, 37953.3] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [62, 17898.9] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [29, 25610.8] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [27, 31815.2] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [59, 34524.0] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [8, 37561.0] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [11, 39738.6] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [41, 40287.8] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [34, 22070.4] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [37, 29375.8] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [59, 33306.1] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [59, 37297.4] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [62, 39605.9] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [59, 40255.3] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [52, 40215.5] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [7, 1872.46] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [32, 3337.65] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [16, 5451.87] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [7, 7943.76] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [30, 10278.1] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [30, 12300.4] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [30, 13004.6] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [1, 4681.16] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [33, 7596.14] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [11, 12038.3] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [8, 17121.1] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [7, 21645.7] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [18, 24801.6] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [25, 26489.9] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [56, 7951.29] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [31, 13061.3] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [31, 19438.7] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [11, 26281.2] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [41, 31973.1] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [11, 34560.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [41, 37295.3] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [15, 12526.5] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [36, 19512.2] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [27, 26250.3] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [62, 31807.7] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [52, 34137.6] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [56, 37148.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 37135.7] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [35, 18067.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [36, 25568.6] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [27, 31411.9] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [59, 34285.2] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [41, 37824.7] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [25, 39731.2] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [41, 40255.9] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [20, 21989.7] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [59, 29567.8] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [15, 33288.8] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [59, 37445.3] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [62, 39627.4] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [59, 40289.4] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [38, 40195.0] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [43, 25429.7] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [29, 31016.9] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [59, 35987.8] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [59, 39042.8] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [62, 40059.8] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [51, 40458.9] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [17, 40375.6] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [24, 2510.81] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [22, 4561.51] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [30, 7388.4] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [30, 10738.3] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [16, 13631.0] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 16088.9] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 17243.6] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [21, 5612.07] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [31, 9841.45] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [8, 15707.2] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [19, 22468.9] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 28579.8] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 33432.2] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [39, 34105.0] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [56, 10348.3] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [29, 16414.1] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [41, 23547.0] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [8, 30174.9] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [40, 34884.4] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [41, 37141.5] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [7, 39111.0] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [59, 14532.0] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [29, 21793.9] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [27, 28396.9] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [8, 33692.4] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [25, 36478.4] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [41, 39067.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [41, 40319.3] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [35, 19888.6] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [27, 27598.4] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [59, 33345.0] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [62, 35574.4] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [59, 38600.6] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [11, 40177.1] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [41, 40497.8] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [35, 23611.2] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [13, 30611.0] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [59, 34723.1] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [59, 38192.1] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [62, 40215.5] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [52, 40698.2] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [16, 40568.9] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [27, 24612.5] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [59, 30943.6] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [59, 34765.0] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [59, 39125.2] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [59, 40453.0] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [59, 40600.8] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [31, 40308.5] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HHS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HHS_BH.yaml new file mode 100644 index 00000000000..505c5d7b173 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HHS_BH.yaml @@ -0,0 +1,8943 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [0, 33.3475] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [1, 60.2562] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [19, 94.8167] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [28, 147.086] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [23, 210.168] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [27, 261.581] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [26, 295.415] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [1, 61.3707] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [1, 111.397] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [19, 189.736] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [27, 295.728] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [24, 414.929] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [23, 525.933] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [27, 585.808] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [1, 134.45] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [18, 242.98] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [24, 410.563] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [27, 635.406] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [27, 891.082] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [24, 1097.41] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [29, 1205.26] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [15, 302.359] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [24, 542.04] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [24, 912.8] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [24, 1372.82] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [29, 1876.75] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [29, 2242.72] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [29, 2438.11] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [19, 629.397] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [27, 1132.07] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [29, 1885.08] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [24, 2826.35] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [24, 3808.89] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [29, 4513.64] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [29, 4923.25] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [19, 1215.39] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [25, 2175.47] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [26, 3627.9] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [26, 5468.9] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [28, 7433.39] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [28, 9170.99] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [3, 8846.25] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [2, 2195.68] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [5, 3933.22] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [11, 6499.63] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [18, 9777.63] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [8, 13251.5] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [18, 15685.1] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [5, 16704.3] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [16, 67.2941] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [7, 124.401] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [0, 196.068] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [19, 286.868] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [29, 413.007] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [27, 530.017] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [24, 585.059] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [15, 152.034] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [19, 277.18] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [19, 456.05] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [5, 674.052] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [27, 924.666] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [24, 1123.72] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [29, 1216.84] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [1, 343.736] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [11, 604.539] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [29, 981.35] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [29, 1454.71] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [27, 1948.45] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [29, 2327.01] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [24, 2473.28] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [3, 722.9] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [19, 1272.74] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [29, 2066.92] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [22, 3012.59] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [24, 3988.87] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [27, 4769.28] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [29, 4977.46] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [11, 1472.71] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [24, 2582.68] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [29, 4210.08] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [24, 6137.6] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [24, 8046.12] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [24, 9449.25] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [29, 10159.3] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [19, 2906.64] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [19, 5100.23] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [27, 8268.71] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [29, 11997.6] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [27, 15820.9] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [27, 18992.7] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [11, 18019.1] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [19, 4825.2] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [19, 8484.03] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [11, 13846.8] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [5, 20582.3] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [5, 27091.3] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [17, 32514.8] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [5, 33128.8] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [0, 148.04] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [16, 271.933] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [0, 428.383] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [24, 635.018] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [29, 886.553] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [28, 1096.72] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [27, 1208.16] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [12, 348.246] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [11, 634.342] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [11, 982.96] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [29, 1448.43] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [24, 1926.75] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [27, 2324.75] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [29, 2466.64] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [19, 722.903] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [9, 1261.63] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [24, 2152.84] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [27, 3097.41] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [27, 4004.08] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [27, 4718.31] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [29, 4981.15] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [11, 1478.42] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [19, 2574.75] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [27, 4185.91] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [29, 6097.44] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [29, 8020.15] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [29, 9391.05] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [29, 10014.5] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [19, 2914.71] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [19, 5103.34] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [29, 7932.44] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [24, 12016.9] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [29, 15848.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [29, 18661.3] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [27, 18776.9] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [5, 4737.29] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [19, 8492.59] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [9, 14290.6] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [11, 20837.9] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [5, 27317.4] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [5, 32149.7] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [3, 33865.9] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [15, 8499.07] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [24, 14293.6] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [19, 22544.2] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [17, 29125.4] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [17, 34461.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [3, 36147.9] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [3, 38170.8] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [1, 240.426] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [15, 435.935] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [10, 679.862] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [27, 998.719] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [24, 1368.37] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [24, 1659.57] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [26, 1798.48] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [19, 538.652] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [11, 980.589] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [29, 1562.69] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [29, 2290.29] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [25, 2974.84] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [24, 3473.77] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [27, 3719.3] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [19, 1157.36] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [3, 2009.08] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [24, 3112.63] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [29, 4552.4] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [24, 5973.69] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [29, 7023.88] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [29, 7430.08] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [11, 2239.34] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [5, 3895.61] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [27, 6304.02] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [24, 9187.06] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [27, 11949.5] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [24, 13987.7] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [24, 14798.4] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [9, 3709.02] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [5, 6579.24] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [9, 10616.2] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [3, 15411.9] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [5, 20280.6] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [3, 23975.9] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [19, 25768.0] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [19, 6402.68] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [19, 10935.7] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [11, 18285.7] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [19, 25383.0] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [19, 31209.6] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [11, 33705.2] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [19, 36970.6] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [15, 12407.6] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [6, 18847.1] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [24, 25508.5] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [29, 31019.7] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [29, 32931.3] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [27, 35585.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [29, 36275.9] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [6, 500.191] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [7, 904.588] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [9, 1407.95] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [29, 2078.95] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [24, 2727.69] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [23, 3304.43] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [24, 3579.76] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [19, 1064.72] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [11, 1926.64] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [24, 3102.28] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [24, 4544.59] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [24, 5996.12] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [24, 7004.09] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [29, 7321.33] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [19, 2131.25] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [19, 3751.61] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [29, 6078.68] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [24, 9161.14] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [27, 11783.8] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [29, 13862.5] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [27, 14794.9] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [3, 3885.4] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [5, 6835.71] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [5, 11035.2] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [19, 15944.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [11, 20272.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [19, 24107.3] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [5, 25649.4] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [15, 6407.57] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [7, 10938.0] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [9, 18267.5] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [17, 25837.5] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [19, 31605.2] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [5, 33986.9] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [11, 36786.5] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [16, 12255.0] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [16, 18944.8] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [19, 26147.8] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [5, 31502.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [29, 33500.5] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [29, 36337.9] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [29, 37489.7] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [21, 16901.1] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [16, 24224.0] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [16, 30558.3] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [29, 33524.2] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [29, 36968.0] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [29, 38855.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [3, 39145.7] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [5, 959.499] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [1, 1739.4] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [9, 2761.83] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [25, 4153.09] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [24, 5531.84] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 6735.99] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [27, 6695.34] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [19, 2156.07] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [11, 3877.6] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [11, 6205.34] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [24, 9034.54] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [24, 11692.1] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 13805.5] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [23, 12360.0] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [29, 3770.15] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [3, 6807.05] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [5, 10966.6] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [19, 15902.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [5, 20509.9] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 24440.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [5, 25719.9] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [29, 6648.81] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [19, 12339.1] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [5, 18775.1] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [5, 25721.9] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [17, 31457.1] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 33933.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [9, 36789.4] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [16, 12281.9] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [16, 19119.2] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [16, 25930.7] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [29, 31323.7] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [29, 33901.0] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 36396.2] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [27, 37584.2] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [15, 16976.6] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [15, 24325.0] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [29, 30859.2] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [29, 33834.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [29, 37161.2] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 38910.3] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 39133.9] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [21, 21167.6] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [16, 28283.0] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [16, 32543.4] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [29, 36737.0] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [29, 39192.2] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [29, 39767.9] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 39218.6] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [18, 1860.54] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [10, 3224.71] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [10, 5030.61] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [10, 7427.89] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [2, 9882.95] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [4, 11801.7] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 12723.2] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [5, 4342.66] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [19, 7119.02] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [5, 11401.4] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [5, 16782.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [5, 21243.6] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [5, 24716.9] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [11, 25831.7] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [5, 7827.6] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [11, 12872.5] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [9, 19403.0] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [9, 26224.5] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [5, 31837.7] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [19, 34350.1] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [5, 37051.4] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [15, 12697.1] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [14, 19472.4] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [15, 26275.9] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [9, 31548.3] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [29, 33573.8] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [29, 36265.5] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [10, 36745.4] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [20, 16943.8] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [15, 24218.2] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [16, 30664.2] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [29, 33449.0] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [29, 37220.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [29, 38894.3] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 39112.3] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [21, 21172.1] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [16, 28450.9] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [29, 32295.4] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [29, 36761.3] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [29, 39147.5] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [29, 39866.8] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [19, 39321.9] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [12, 24300.0] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [15, 29880.3] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [15, 35099.8] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [29, 38357.6] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [29, 39665.7] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [27, 39551.4] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [29, 39057.0] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [18, 1941.12] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [18, 3898.94] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [8, 6458.35] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [2, 9728.69] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 13058.7] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [10, 15758.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 16838.4] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [19, 5578.46] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [5, 10244.0] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [19, 16097.0] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [5, 22698.7] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [11, 28528.0] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [5, 32699.0] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 33836.6] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [16, 10047.7] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [19, 16085.4] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [9, 22574.6] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [5, 29203.0] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [17, 34067.4] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [9, 35986.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 38270.9] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [16, 14686.2] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [7, 21709.2] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [19, 28392.2] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [19, 33614.1] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [19, 35320.3] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [9, 37922.5] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 39119.3] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [13, 18758.5] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [16, 26225.8] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [16, 32201.8] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [29, 35193.0] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [29, 38060.3] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [29, 39637.9] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 39003.6] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [21, 22616.4] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [16, 29748.3] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [29, 33752.1] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [29, 37550.1] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [29, 39729.6] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [27, 40134.6] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 39859.9] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [21, 23473.8] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [19, 30001.5] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [29, 34046.1] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [29, 38624.9] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [16, 39558.4] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [27, 39854.3] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 38537.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml new file mode 100644 index 00000000000..7810b3d95c6 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_HHS_BH_GB.yaml @@ -0,0 +1,8943 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW4_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS0_GRVW8_PLR1_SIA2_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_AMAS3_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM2 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 2 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS0_GRVW8_PLR0_SIA1_SVW1_VW1_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 1 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_AMAS3_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [0, 33.3475] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [1, 60.2562] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [19, 94.8167] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [28, 147.086] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [23, 210.168] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [27, 261.581] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [26, 295.415] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [1, 61.3707] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [1, 111.397] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [19, 189.736] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [27, 295.728] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [24, 414.929] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [23, 525.933] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [27, 585.808] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [1, 134.45] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [18, 242.98] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [24, 410.563] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [27, 635.406] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [27, 891.082] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [24, 1097.41] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [29, 1205.26] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [15, 302.359] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [24, 542.04] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [24, 912.8] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [24, 1372.82] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [29, 1876.75] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [29, 2242.72] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [29, 2438.11] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [19, 629.397] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [27, 1132.07] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [29, 1885.08] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [24, 2826.35] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [24, 3808.89] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [29, 4513.64] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [29, 4923.25] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [19, 1215.39] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [25, 2175.47] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [26, 3627.9] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [26, 5468.9] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [28, 7433.39] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [28, 9170.99] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [3, 8846.25] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [2, 2195.68] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [5, 3933.22] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [11, 6499.63] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [18, 9777.63] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [8, 13251.5] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [18, 15685.1] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [5, 16704.3] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [16, 67.2941] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [7, 124.401] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [0, 196.068] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [19, 286.868] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [29, 413.007] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [27, 530.017] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [24, 585.059] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [15, 152.034] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [19, 277.18] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [19, 456.05] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [5, 674.052] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [27, 924.666] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [24, 1123.72] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [29, 1216.84] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [1, 343.736] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [11, 604.539] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [29, 981.35] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [29, 1454.71] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [27, 1948.45] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [29, 2327.01] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [24, 2473.28] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [3, 722.9] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [19, 1272.74] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [29, 2066.92] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [22, 3012.59] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [24, 3988.87] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [27, 4769.28] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [29, 4977.46] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [11, 1472.71] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [24, 2582.68] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [29, 4210.08] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [24, 6137.6] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [24, 8046.12] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [24, 9449.25] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [29, 10159.3] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [19, 2906.64] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [19, 5100.23] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [27, 8268.71] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [29, 11997.6] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [27, 15820.9] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [27, 18992.7] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [11, 18019.1] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [19, 4825.2] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [19, 8484.03] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [11, 13846.8] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [5, 20582.3] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [5, 27091.3] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [17, 32514.8] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [5, 33128.8] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [0, 148.04] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [16, 271.933] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [0, 428.383] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [24, 635.018] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [29, 886.553] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [28, 1096.72] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [27, 1208.16] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [12, 348.246] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [11, 634.342] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [11, 982.96] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [29, 1448.43] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [24, 1926.75] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [27, 2324.75] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [29, 2466.64] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [19, 722.903] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [9, 1261.63] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [24, 2152.84] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [27, 3097.41] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [27, 4004.08] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [27, 4718.31] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [29, 4981.15] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [11, 1478.42] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [19, 2574.75] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [27, 4185.91] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [29, 6097.44] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [29, 8020.15] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [29, 9391.05] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [29, 10014.5] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [19, 2914.71] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [19, 5103.34] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [29, 7932.44] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [24, 12016.9] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [29, 15848.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [29, 18661.3] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [27, 18776.9] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [5, 4737.29] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [19, 8492.59] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [9, 14290.6] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [11, 20837.9] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [5, 27317.4] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [5, 32149.7] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [3, 33865.9] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [15, 8499.07] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [24, 14293.6] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [19, 22544.2] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [17, 29125.4] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [17, 34461.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [3, 36147.9] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [3, 38170.8] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [1, 240.426] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [15, 435.935] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [10, 679.862] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [27, 998.719] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [24, 1368.37] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [24, 1659.57] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [26, 1798.48] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [19, 538.652] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [11, 980.589] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [29, 1562.69] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [29, 2290.29] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [25, 2974.84] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [24, 3473.77] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [27, 3719.3] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [19, 1157.36] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [3, 2009.08] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [24, 3112.63] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [29, 4552.4] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [24, 5973.69] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [29, 7023.88] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [29, 7430.08] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [11, 2239.34] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [5, 3895.61] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [27, 6304.02] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [24, 9187.06] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [27, 11949.5] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [24, 13987.7] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [24, 14798.4] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [9, 3709.02] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [5, 6579.24] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [9, 10616.2] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [3, 15411.9] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [5, 20280.6] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [3, 23975.9] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [19, 25768.0] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [19, 6402.68] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [19, 10935.7] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [11, 18285.7] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [19, 25383.0] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [19, 31209.6] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [11, 33705.2] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [19, 36970.6] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [15, 12407.6] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [6, 18847.1] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [24, 25508.5] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [29, 31019.7] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [29, 32931.3] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [27, 35585.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [29, 36275.9] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [6, 500.191] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [7, 904.588] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [9, 1407.95] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [29, 2078.95] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [24, 2727.69] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [23, 3304.43] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [24, 3579.76] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [19, 1064.72] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [11, 1926.64] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [24, 3102.28] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [24, 4544.59] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [24, 5996.12] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [24, 7004.09] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [29, 7321.33] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [19, 2131.25] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [19, 3751.61] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [29, 6078.68] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [24, 9161.14] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [27, 11783.8] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [29, 13862.5] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [27, 14794.9] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [3, 3885.4] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [5, 6835.71] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [5, 11035.2] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [19, 15944.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [11, 20272.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [19, 24107.3] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [5, 25649.4] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [15, 6407.57] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [7, 10938.0] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [9, 18267.5] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [17, 25837.5] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [19, 31605.2] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [5, 33986.9] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [11, 36786.5] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [16, 12255.0] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [16, 18944.8] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [19, 26147.8] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [5, 31502.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [29, 33500.5] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [29, 36337.9] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [29, 37489.7] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [21, 16901.1] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [16, 24224.0] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [16, 30558.3] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [29, 33524.2] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [29, 36968.0] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [29, 38855.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [3, 39145.7] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [5, 959.499] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [1, 1739.4] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [9, 2761.83] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [25, 4153.09] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [24, 5531.84] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 6735.99] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [27, 6695.34] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [19, 2156.07] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [11, 3877.6] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [11, 6205.34] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [24, 9034.54] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [24, 11692.1] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 13805.5] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [23, 12360.0] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [29, 3770.15] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [3, 6807.05] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [5, 10966.6] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [19, 15902.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [5, 20509.9] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 24440.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [5, 25719.9] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [29, 6648.81] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [19, 12339.1] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [5, 18775.1] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [5, 25721.9] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [17, 31457.1] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 33933.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [9, 36789.4] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [16, 12281.9] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [16, 19119.2] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [16, 25930.7] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [29, 31323.7] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [29, 33901.0] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 36396.2] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [27, 37584.2] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [15, 16976.6] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [15, 24325.0] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [29, 30859.2] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [29, 33834.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [29, 37161.2] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 38910.3] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 39133.9] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [21, 21167.6] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [16, 28283.0] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [16, 32543.4] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [29, 36737.0] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [29, 39192.2] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [29, 39767.9] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 39218.6] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [18, 1860.54] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [10, 3224.71] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [10, 5030.61] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [10, 7427.89] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [2, 9882.95] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [4, 11801.7] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 12723.2] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [5, 4342.66] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [19, 7119.02] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [5, 11401.4] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [5, 16782.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [5, 21243.6] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [5, 24716.9] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [11, 25831.7] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [5, 7827.6] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [11, 12872.5] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [9, 19403.0] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [9, 26224.5] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [5, 31837.7] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [19, 34350.1] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [5, 37051.4] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [15, 12697.1] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [14, 19472.4] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [15, 26275.9] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [9, 31548.3] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [29, 33573.8] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [29, 36265.5] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [10, 36745.4] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [20, 16943.8] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [15, 24218.2] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [16, 30664.2] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [29, 33449.0] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [29, 37220.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [29, 38894.3] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 39112.3] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [21, 21172.1] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [16, 28450.9] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [29, 32295.4] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [29, 36761.3] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [29, 39147.5] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [29, 39866.8] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [19, 39321.9] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [12, 24300.0] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [15, 29880.3] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [15, 35099.8] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [29, 38357.6] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [29, 39665.7] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [27, 39551.4] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [29, 39057.0] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [18, 1941.12] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [18, 3898.94] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [8, 6458.35] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [2, 9728.69] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 13058.7] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [10, 15758.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 16838.4] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [19, 5578.46] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [5, 10244.0] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [19, 16097.0] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [5, 22698.7] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [11, 28528.0] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [5, 32699.0] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 33836.6] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [16, 10047.7] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [19, 16085.4] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [9, 22574.6] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [5, 29203.0] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [17, 34067.4] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [9, 35986.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 38270.9] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [16, 14686.2] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [7, 21709.2] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [19, 28392.2] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [19, 33614.1] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [19, 35320.3] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [9, 37922.5] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 39119.3] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [13, 18758.5] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [16, 26225.8] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [16, 32201.8] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [29, 35193.0] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [29, 38060.3] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [29, 39637.9] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 39003.6] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [21, 22616.4] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [16, 29748.3] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [29, 33752.1] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [29, 37550.1] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [29, 39729.6] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [27, 40134.6] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 39859.9] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [21, 23473.8] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [19, 30001.5] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [29, 34046.1] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [29, 38624.9] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [16, 39558.4] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [27, 39854.3] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 38537.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_I8II_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_I8II_BH.yaml new file mode 100644 index 00000000000..16a81bbb541 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_I8II_BH.yaml @@ -0,0 +1,22983 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [3, 34.2627] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [33, 56.9632] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [38, 96.8661] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [66, 147.604] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [49, 210.685] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [77, 263.884] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [60, 294.901] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [5, 60.4576] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [19, 110.121] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [38, 187.765] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [51, 290.083] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [67, 413.477] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [55, 526.708] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [72, 588.406] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [4, 140.053] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [45, 251.668] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [6, 421.92] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [49, 653.981] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [49, 900.114] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [78, 1115.8] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [67, 1218.28] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [24, 303.671] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [45, 541.69] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [48, 907.465] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [48, 1426.15] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [77, 1907.15] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [78, 2284.64] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [77, 2480.46] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [57, 659.378] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [50, 1177.85] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [48, 1952.89] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [77, 2927.71] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [68, 3873.32] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [77, 4632.03] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [48, 4951.87] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [48, 1203.36] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [73, 2170.68] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [61, 3646.03] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [68, 5522.91] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [77, 7434.25] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [48, 9040.37] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [77, 9846.33] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [24, 2065.4] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [6, 3749.1] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [6, 6298.94] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [6, 9532.51] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [37, 12900.6] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [24, 15701.2] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [54, 17171.6] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [29, 62.2448] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [4, 113.176] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [38, 187.413] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [9, 289.783] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [49, 413.313] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [67, 525.372] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [67, 588.355] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [12, 160.751] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [29, 290.464] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [27, 472.971] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [40, 689.684] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [62, 947.705] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [49, 1132.03] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [67, 1226.23] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [41, 373.624] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [7, 653.93] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [12, 1044.79] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [67, 1531.47] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [49, 1961.56] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [67, 2342.21] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [78, 2514.9] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [29, 771.867] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [79, 1348.43] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [0, 2110.61] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [78, 3077.83] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [49, 4005.55] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [78, 4763.89] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [48, 5050.15] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [76, 1458.63] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [51, 2561.41] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [78, 4183.32] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [49, 6150.57] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [78, 7856.81] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [72, 9498.45] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [67, 10006.9] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [55, 2468.69] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [62, 4447.24] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [69, 7475.67] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [49, 11541.6] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [78, 15324.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [67, 18241.7] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [49, 19964.0] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [36, 4927.95] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [18, 7467.35] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [48, 14361.0] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [78, 22727.6] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [72, 29735.3] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [60, 34877.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [72, 35003.8] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [20, 155.322] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [38, 260.225] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [78, 432.313] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [55, 649.223] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [79, 902.196] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [48, 1119.19] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [48, 1206.43] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [42, 354.069] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [10, 621.931] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [74, 1039.48] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [61, 1520.78] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [49, 1985.35] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [60, 2329.93] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [77, 2491.88] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [22, 770.87] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [23, 1347.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [61, 2169.0] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [77, 3158.07] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [66, 4069.42] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [60, 4735.82] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [48, 5039.53] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [64, 1438.62] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [73, 2554.0] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [73, 4167.23] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [59, 6116.93] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [60, 7836.62] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [66, 9469.97] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [60, 9954.22] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [61, 2458.56] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [61, 4421.46] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [61, 7397.36] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [60, 11170.8] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [78, 15163.3] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [50, 18143.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [77, 19584.5] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [63, 4585.19] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [80, 7421.08] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [61, 14439.8] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [48, 21096.8] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [71, 27582.8] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [48, 33168.9] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [59, 35456.9] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [79, 9041.88] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [50, 14906.5] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [73, 21718.1] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [26, 27834.5] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [9, 33051.5] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [8, 35593.3] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [26, 38502.0] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [13, 242.951] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [14, 415.555] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [49, 654.542] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [54, 1028.44] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [66, 1391.84] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [55, 1679.07] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [48, 1821.93] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [18, 571.327] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [27, 993.601] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [51, 1605.78] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [49, 2332.11] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [78, 2963.82] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [59, 3521.92] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [59, 3760.87] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [74, 1121.87] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [2, 1932.27] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [55, 3073.5] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [67, 4538.47] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [49, 5935.69] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [71, 7048.27] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [60, 7494.44] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [61, 2006.84] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [49, 3571.65] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [55, 5917.19] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [49, 8599.29] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [77, 11625.3] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [60, 13929.3] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [72, 14765.4] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [11, 3747.14] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [73, 6217.64] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [79, 11614.5] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [44, 16918.2] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [33, 21694.7] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [48, 25704.9] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [59, 27010.1] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [73, 7976.52] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [24, 12928.8] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [38, 19393.8] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [37, 26301.7] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [38, 32067.3] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [40, 34583.3] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [8, 37104.1] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [81, 12179.5] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [40, 18852.6] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [73, 25306.6] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [60, 30666.7] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [50, 33311.6] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [48, 35958.0] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [68, 36635.3] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [13, 505.176] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [14, 863.499] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [48, 1380.46] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [48, 2091.06] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [78, 2849.71] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [60, 3433.27] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [59, 3661.69] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [62, 1110.39] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [56, 1951.44] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [62, 3175.1] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [72, 4628.2] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [49, 6002.59] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [72, 7020.5] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [72, 7485.8] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [73, 2001.42] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [51, 3542.99] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [56, 5872.32] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [50, 8530.79] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [48, 11401.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [48, 13724.2] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [55, 15041.8] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [2, 3748.82] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [39, 6111.92] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [24, 10137.3] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [54, 15728.6] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [22, 20967.2] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [49, 25606.8] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [49, 26324.1] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [77, 7600.7] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [39, 12523.4] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [24, 19886.1] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [24, 26775.8] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [17, 32280.7] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [39, 34310.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [26, 37191.5] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [77, 11795.6] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [73, 19079.5] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [73, 25839.3] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [59, 31127.8] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [2, 33297.1] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [71, 36502.0] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [71, 38152.0] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [81, 16746.5] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [61, 24034.8] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [71, 30200.9] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [26, 33339.1] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [26, 36892.0] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [26, 38878.4] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [40, 39582.1] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [2, 947.081] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [50, 1670.37] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [54, 2804.93] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [59, 4234.18] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [77, 5631.51] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [71, 6717.16] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [77, 7358.7] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [9, 1962.09] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [10, 3492.83] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [61, 5653.97] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [78, 8494.09] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [48, 11357.7] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [78, 13691.0] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [78, 14899.9] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [2, 3742.69] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [26, 6331.82] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [39, 10391.6] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [28, 16107.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [48, 21539.2] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [26, 24863.6] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [31, 26156.5] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [73, 8018.46] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [61, 13105.5] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [24, 19872.4] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [37, 26899.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [8, 31927.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [37, 34203.6] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [8, 37215.5] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [79, 11781.8] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [79, 19117.6] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [46, 25847.6] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [21, 30898.5] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [34, 33435.4] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [71, 36434.3] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [71, 38210.3] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [75, 16816.5] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [59, 24024.7] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [61, 30174.8] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [73, 33507.0] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [27, 36943.2] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 38954.3] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [9, 39796.6] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [75, 21348.5] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [73, 28464.1] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [46, 32408.7] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [39, 36787.1] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [26, 39067.0] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [71, 39153.4] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [8, 39715.5] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [4, 1768.51] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [45, 3140.23] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [24, 5187.77] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [6, 7705.4] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [37, 10029.2] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [26, 11978.4] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [17, 12881.2] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [30, 3765.65] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [18, 6083.11] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [27, 9996.38] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [46, 16366.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 21266.2] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 24924.4] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [32, 26196.5] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [61, 7920.04] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [39, 13041.0] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [25, 19850.8] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [38, 26351.6] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [16, 31923.7] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [37, 34469.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [8, 37234.0] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [64, 12213.5] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [68, 19081.3] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [15, 25816.1] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [61, 30738.1] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [79, 33456.2] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 36115.7] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 37885.0] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [64, 16731.2] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [71, 24119.7] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [79, 30248.5] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [47, 33607.0] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 36806.5] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 38860.1] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 39798.1] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [75, 21318.0] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [77, 28415.9] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [39, 32521.9] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [26, 36773.7] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [40, 39002.4] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 39209.6] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [26, 39950.9] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [53, 6862.26] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [53, 12876.3] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [53, 23076.2] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [40, 34836.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [25, 38469.0] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [39, 40068.0] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [26, 40246.6] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [43, 2248.66] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [0, 4086.52] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [77, 6752.09] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [77, 10108.3] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [6, 13353.0] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [6, 15962.6] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [45, 16985.0] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [35, 5693.95] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [1, 8636.95] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [26, 14180.4] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [61, 21198.4] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [47, 28723.5] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 33544.0] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 34958.8] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [77, 9707.63] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [50, 15822.0] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [61, 22295.3] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [25, 28417.9] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 33036.2] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [26, 35523.8] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [9, 38749.7] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [70, 13761.7] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [50, 20922.5] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [61, 27401.2] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [59, 32510.1] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [39, 35154.5] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 37819.2] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [27, 39756.8] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [75, 18664.7] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [71, 26076.9] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [73, 31845.8] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [46, 35125.2] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 37834.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [39, 39550.7] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [26, 40028.1] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [76, 22394.8] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [73, 29770.8] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [46, 33621.7] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [46, 37010.3] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [47, 38908.1] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [26, 39496.0] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 39951.5] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [58, 7088.68] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [52, 13399.6] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [65, 24382.5] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [38, 36186.0] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [40, 38939.5] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [46, 39122.0] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [26, 40501.1] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml new file mode 100644 index 00000000000..14a6ba0db35 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_I8II_BH_GB.yaml @@ -0,0 +1,22983 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4096 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 128 + LSPA: 16 + LSPB: 16 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 16384 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 12288 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: false + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [3, 34.2627] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [33, 56.9632] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [38, 96.8661] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [66, 147.604] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [49, 210.685] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [77, 263.884] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [60, 294.901] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [5, 60.4576] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [19, 110.121] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [38, 187.765] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [51, 290.083] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [67, 413.477] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [55, 526.708] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [72, 588.406] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [4, 140.053] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [45, 251.668] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [6, 421.92] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [49, 653.981] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [49, 900.114] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [78, 1115.8] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [67, 1218.28] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [24, 303.671] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [45, 541.69] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [48, 907.465] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [48, 1426.15] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [77, 1907.15] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [78, 2284.64] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [77, 2480.46] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [57, 659.378] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [50, 1177.85] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [48, 1952.89] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [77, 2927.71] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [68, 3873.32] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [77, 4632.03] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [48, 4951.87] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [48, 1203.36] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [73, 2170.68] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [61, 3646.03] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [68, 5522.91] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [77, 7434.25] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [48, 9040.37] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [77, 9846.33] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [24, 2065.4] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [6, 3749.1] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [6, 6298.94] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [6, 9532.51] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [37, 12900.6] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [24, 15701.2] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [54, 17171.6] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [29, 62.2448] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [4, 113.176] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [38, 187.413] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [9, 289.783] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [49, 413.313] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [67, 525.372] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [67, 588.355] + - - [128, 128, 1, 64, 160, 160, 128, 128] + - [12, 160.751] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [29, 290.464] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [27, 472.971] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [40, 689.684] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [62, 947.705] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [49, 1132.03] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [67, 1226.23] + - - [128, 256, 1, 64, 160, 160, 128, 256] + - [41, 373.624] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [7, 653.93] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [12, 1044.79] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [67, 1531.47] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [49, 1961.56] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [67, 2342.21] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [78, 2514.9] + - - [128, 512, 1, 64, 160, 160, 128, 512] + - [29, 771.867] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [79, 1348.43] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [0, 2110.61] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [78, 3077.83] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [49, 4005.55] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [78, 4763.89] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [48, 5050.15] + - - [128, 1024, 1, 64, 160, 160, 128, 1024] + - [76, 1458.63] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [51, 2561.41] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [78, 4183.32] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [49, 6150.57] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [78, 7856.81] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [72, 9498.45] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [67, 10006.9] + - - [128, 2048, 1, 64, 160, 160, 128, 2048] + - [55, 2468.69] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [62, 4447.24] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [69, 7475.67] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [49, 11541.6] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [78, 15324.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [67, 18241.7] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [49, 19964.0] + - - [128, 4096, 1, 64, 160, 160, 128, 4096] + - [36, 4927.95] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [18, 7467.35] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [48, 14361.0] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [78, 22727.6] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [72, 29735.3] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [60, 34877.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [72, 35003.8] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [20, 155.322] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [38, 260.225] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [78, 432.313] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [55, 649.223] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [79, 902.196] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [48, 1119.19] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [48, 1206.43] + - - [256, 128, 1, 64, 288, 288, 256, 128] + - [42, 354.069] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [10, 621.931] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [74, 1039.48] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [61, 1520.78] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [49, 1985.35] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [60, 2329.93] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [77, 2491.88] + - - [256, 256, 1, 64, 288, 288, 256, 256] + - [22, 770.87] + - - [256, 256, 1, 128, 288, 288, 256, 256] + - [23, 1347.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [61, 2169.0] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [77, 3158.07] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [66, 4069.42] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [60, 4735.82] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [48, 5039.53] + - - [256, 512, 1, 64, 288, 288, 256, 512] + - [64, 1438.62] + - - [256, 512, 1, 128, 288, 288, 256, 512] + - [73, 2554.0] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [73, 4167.23] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [59, 6116.93] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [60, 7836.62] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [66, 9469.97] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [60, 9954.22] + - - [256, 1024, 1, 64, 288, 288, 256, 1024] + - [61, 2458.56] + - - [256, 1024, 1, 128, 288, 288, 256, 1024] + - [61, 4421.46] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [61, 7397.36] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [60, 11170.8] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [78, 15163.3] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [50, 18143.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [77, 19584.5] + - - [256, 2048, 1, 64, 288, 288, 256, 2048] + - [63, 4585.19] + - - [256, 2048, 1, 128, 288, 288, 256, 2048] + - [80, 7421.08] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [61, 14439.8] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [48, 21096.8] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [71, 27582.8] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [48, 33168.9] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [59, 35456.9] + - - [256, 4096, 1, 64, 288, 288, 256, 4096] + - [79, 9041.88] + - - [256, 4096, 1, 128, 288, 288, 256, 4096] + - [50, 14906.5] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [73, 21718.1] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [26, 27834.5] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [9, 33051.5] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [8, 35593.3] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [26, 38502.0] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [13, 242.951] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [14, 415.555] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [49, 654.542] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [54, 1028.44] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [66, 1391.84] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [55, 1679.07] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [48, 1821.93] + - - [384, 128, 1, 64, 416, 416, 384, 128] + - [18, 571.327] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [27, 993.601] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [51, 1605.78] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [49, 2332.11] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [78, 2963.82] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [59, 3521.92] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [59, 3760.87] + - - [384, 256, 1, 64, 416, 416, 384, 256] + - [74, 1121.87] + - - [384, 256, 1, 128, 416, 416, 384, 256] + - [2, 1932.27] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [55, 3073.5] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [67, 4538.47] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [49, 5935.69] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [71, 7048.27] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [60, 7494.44] + - - [384, 512, 1, 64, 416, 416, 384, 512] + - [61, 2006.84] + - - [384, 512, 1, 128, 416, 416, 384, 512] + - [49, 3571.65] + - - [384, 512, 1, 256, 416, 416, 384, 512] + - [55, 5917.19] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [49, 8599.29] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [77, 11625.3] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [60, 13929.3] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [72, 14765.4] + - - [384, 1024, 1, 64, 416, 416, 384, 1024] + - [11, 3747.14] + - - [384, 1024, 1, 128, 416, 416, 384, 1024] + - [73, 6217.64] + - - [384, 1024, 1, 256, 416, 416, 384, 1024] + - [79, 11614.5] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [44, 16918.2] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [33, 21694.7] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [48, 25704.9] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [59, 27010.1] + - - [384, 2048, 1, 64, 416, 416, 384, 2048] + - [73, 7976.52] + - - [384, 2048, 1, 128, 416, 416, 384, 2048] + - [24, 12928.8] + - - [384, 2048, 1, 256, 416, 416, 384, 2048] + - [38, 19393.8] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [37, 26301.7] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [38, 32067.3] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [40, 34583.3] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [8, 37104.1] + - - [384, 4096, 1, 64, 416, 416, 384, 4096] + - [81, 12179.5] + - - [384, 4096, 1, 128, 416, 416, 384, 4096] + - [40, 18852.6] + - - [384, 4096, 1, 256, 416, 416, 384, 4096] + - [73, 25306.6] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [60, 30666.7] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [50, 33311.6] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [48, 35958.0] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [68, 36635.3] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [13, 505.176] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [14, 863.499] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [48, 1380.46] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [48, 2091.06] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [78, 2849.71] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [60, 3433.27] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [59, 3661.69] + - - [768, 128, 1, 64, 800, 800, 768, 128] + - [62, 1110.39] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [56, 1951.44] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [62, 3175.1] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [72, 4628.2] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [49, 6002.59] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [72, 7020.5] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [72, 7485.8] + - - [768, 256, 1, 64, 800, 800, 768, 256] + - [73, 2001.42] + - - [768, 256, 1, 128, 800, 800, 768, 256] + - [51, 3542.99] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [56, 5872.32] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [50, 8530.79] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [48, 11401.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [48, 13724.2] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [55, 15041.8] + - - [768, 512, 1, 64, 800, 800, 768, 512] + - [2, 3748.82] + - - [768, 512, 1, 128, 800, 800, 768, 512] + - [39, 6111.92] + - - [768, 512, 1, 256, 800, 800, 768, 512] + - [24, 10137.3] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [54, 15728.6] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [22, 20967.2] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [49, 25606.8] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [49, 26324.1] + - - [768, 1024, 1, 64, 800, 800, 768, 1024] + - [77, 7600.7] + - - [768, 1024, 1, 128, 800, 800, 768, 1024] + - [39, 12523.4] + - - [768, 1024, 1, 256, 800, 800, 768, 1024] + - [24, 19886.1] + - - [768, 1024, 1, 512, 800, 800, 768, 1024] + - [24, 26775.8] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [17, 32280.7] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [39, 34310.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [26, 37191.5] + - - [768, 2048, 1, 64, 800, 800, 768, 2048] + - [77, 11795.6] + - - [768, 2048, 1, 128, 800, 800, 768, 2048] + - [73, 19079.5] + - - [768, 2048, 1, 256, 800, 800, 768, 2048] + - [73, 25839.3] + - - [768, 2048, 1, 512, 800, 800, 768, 2048] + - [59, 31127.8] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [2, 33297.1] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [71, 36502.0] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [71, 38152.0] + - - [768, 4096, 1, 64, 800, 800, 768, 4096] + - [81, 16746.5] + - - [768, 4096, 1, 128, 800, 800, 768, 4096] + - [61, 24034.8] + - - [768, 4096, 1, 256, 800, 800, 768, 4096] + - [71, 30200.9] + - - [768, 4096, 1, 512, 800, 800, 768, 4096] + - [26, 33339.1] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [26, 36892.0] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [26, 38878.4] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [40, 39582.1] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [2, 947.081] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [50, 1670.37] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [54, 2804.93] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [59, 4234.18] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [77, 5631.51] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [71, 6717.16] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [77, 7358.7] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 128] + - [9, 1962.09] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [10, 3492.83] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [61, 5653.97] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [78, 8494.09] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [48, 11357.7] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [78, 13691.0] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [78, 14899.9] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 256] + - [2, 3742.69] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 256] + - [26, 6331.82] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [39, 10391.6] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [28, 16107.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [48, 21539.2] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [26, 24863.6] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [31, 26156.5] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 512] + - [73, 8018.46] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 512] + - [61, 13105.5] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 512] + - [24, 19872.4] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [37, 26899.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [8, 31927.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [37, 34203.6] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [8, 37215.5] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 1024] + - [79, 11781.8] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 1024] + - [79, 19117.6] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 1024] + - [46, 25847.6] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 1024] + - [21, 30898.5] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [34, 33435.4] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [71, 36434.3] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [71, 38210.3] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 2048] + - [75, 16816.5] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 2048] + - [59, 24024.7] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 2048] + - [61, 30174.8] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 2048] + - [73, 33507.0] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 2048] + - [27, 36943.2] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 38954.3] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [9, 39796.6] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 4096] + - [75, 21348.5] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 4096] + - [73, 28464.1] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 4096] + - [46, 32408.7] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 4096] + - [39, 36787.1] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 4096] + - [26, 39067.0] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [71, 39153.4] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [8, 39715.5] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [4, 1768.51] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [45, 3140.23] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [24, 5187.77] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [6, 7705.4] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [37, 10029.2] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [26, 11978.4] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [17, 12881.2] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 128] + - [30, 3765.65] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [18, 6083.11] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [27, 9996.38] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [46, 16366.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 21266.2] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 24924.4] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [32, 26196.5] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 256] + - [61, 7920.04] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 256] + - [39, 13041.0] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [25, 19850.8] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [38, 26351.6] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [16, 31923.7] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [37, 34469.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [8, 37234.0] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 512] + - [64, 12213.5] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 512] + - [68, 19081.3] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 512] + - [15, 25816.1] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [61, 30738.1] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [79, 33456.2] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 36115.7] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 37885.0] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 1024] + - [64, 16731.2] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 1024] + - [71, 24119.7] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 1024] + - [79, 30248.5] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 1024] + - [47, 33607.0] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 36806.5] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 38860.1] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 39798.1] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 2048] + - [75, 21318.0] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 2048] + - [77, 28415.9] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 2048] + - [39, 32521.9] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 2048] + - [26, 36773.7] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 2048] + - [40, 39002.4] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 39209.6] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [26, 39950.9] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 4096] + - [53, 6862.26] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 4096] + - [53, 12876.3] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 4096] + - [53, 23076.2] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 4096] + - [40, 34836.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 4096] + - [25, 38469.0] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 4096] + - [39, 40068.0] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [26, 40246.6] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [43, 2248.66] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [0, 4086.52] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [77, 6752.09] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [77, 10108.3] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [6, 13353.0] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [6, 15962.6] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [45, 16985.0] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 128] + - [35, 5693.95] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [1, 8636.95] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [26, 14180.4] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [61, 21198.4] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [47, 28723.5] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 33544.0] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 34958.8] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 256] + - [77, 9707.63] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 256] + - [50, 15822.0] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [61, 22295.3] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [25, 28417.9] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 33036.2] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [26, 35523.8] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [9, 38749.7] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 512] + - [70, 13761.7] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 512] + - [50, 20922.5] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 512] + - [61, 27401.2] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [59, 32510.1] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [39, 35154.5] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 37819.2] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [27, 39756.8] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 1024] + - [75, 18664.7] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 1024] + - [71, 26076.9] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 1024] + - [73, 31845.8] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 1024] + - [46, 35125.2] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [8, 37834.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [39, 39550.7] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [26, 40028.1] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 2048] + - [76, 22394.8] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 2048] + - [73, 29770.8] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 2048] + - [46, 33621.7] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 2048] + - [46, 37010.3] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 2048] + - [47, 38908.1] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [26, 39496.0] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [8, 39951.5] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 4096] + - [58, 7088.68] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 4096] + - [52, 13399.6] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 4096] + - [65, 24382.5] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 4096] + - [38, 36186.0] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 4096] + - [40, 38939.5] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 4096] + - [46, 39122.0] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [26, 40501.1] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_SB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_SB.yaml new file mode 100644 index 00000000000..97f2f7b2a57 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bjlk_SB.yaml @@ -0,0 +1,310 @@ +- {MinimumRequiredVersion: 4.33.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 32 + LVCB: 32 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_ + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWforTLUandMI: false +- [2, 3, 0, 1] +- - - [126, 126, 2, 66, 126, 126, 126, 126] + - [0, 0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_BBS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_BBS_BH.yaml new file mode 100644 index 00000000000..8fbf63edecf --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_BBS_BH.yaml @@ -0,0 +1,14343 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 31.5455] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [18, 58.1573] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [2, 92.6138] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [41, 148.408] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [49, 214.872] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [41, 275.406] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [41, 308.15] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [5, 55.8823] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [31, 103.635] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [22, 179.998] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [41, 290.042] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [41, 422.579] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [41, 545.885] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [41, 615.982] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [5, 128.96] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [21, 235.053] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [22, 405.128] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [41, 646.222] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [44, 910.517] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [49, 1146.1] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [44, 1274.98] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [31, 288.268] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [34, 526.46] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [44, 896.506] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [41, 1400.79] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [39, 1890.07] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [44, 2351.15] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [44, 2575.12] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [34, 571.429] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [41, 1047.92] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [39, 1793.01] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [41, 2791.32] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [42, 3852.4] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [39, 4840.68] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [41, 5274.3] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [9, 1141.0] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [41, 2085.94] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [44, 3573.04] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [41, 5542.98] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [44, 7771.73] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [49, 9778.7] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [46, 10868.9] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [34, 2055.78] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [33, 3798.32] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [33, 6475.81] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [33, 10103.7] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [7, 14055.7] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [33, 17325.1] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [34, 18171.0] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [5, 63.5655] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [5, 118.698] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [2, 188.677] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [49, 297.025] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [43, 425.537] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [49, 549.424] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [41, 614.55] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [9, 138.958] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [9, 263.793] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [9, 450.129] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [49, 686.129] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [41, 945.993] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [48, 1168.41] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [49, 1286.4] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [22, 340.501] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [9, 610.791] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [9, 1016.19] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [41, 1489.98] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [49, 2000.02] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [43, 2422.62] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [49, 2606.32] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [28, 726.035] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [9, 1289.36] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [44, 2142.41] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [44, 3157.77] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [41, 4194.3] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [39, 5001.78] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [49, 5274.29] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [44, 1468.59] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [42, 2612.86] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [41, 4346.43] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [47, 6454.02] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [41, 8498.02] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [49, 10038.3] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [40, 10967.3] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [44, 2697.74] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [41, 4830.74] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [44, 8108.85] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [47, 12237.2] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [41, 16675.1] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [41, 20214.3] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [38, 19144.9] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [45, 4424.36] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [32, 8058.22] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [21, 13687.3] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [33, 21218.5] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [9, 29324.4] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [33, 36226.1] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [22, 34507.1] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [14, 141.604] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [5, 263.394] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [1, 421.537] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [41, 643.445] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [41, 904.968] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [44, 1138.48] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [41, 1262.65] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [9, 322.589] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [22, 613.023] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [41, 1013.61] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [8, 1491.04] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [48, 1980.55] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [44, 2414.42] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [49, 2587.13] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [28, 687.929] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [9, 1236.71] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [41, 2041.52] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [44, 3068.25] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [49, 4176.55] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [49, 4988.02] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [44, 5248.0] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [22, 1408.43] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [41, 2521.37] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [44, 4170.32] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [49, 6216.09] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [44, 8264.64] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [49, 9917.08] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [49, 10609.7] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [49, 2777.22] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [49, 4954.88] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [49, 8125.54] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [44, 12225.0] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [41, 16373.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [49, 19692.3] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [48, 21657.6] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [33, 4637.79] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [21, 8376.04] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [21, 13583.4] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [21, 20875.3] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [9, 28780.5] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [33, 35400.8] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [21, 38470.2] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [49, 7849.91] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [3, 14079.3] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [23, 20854.2] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [27, 27932.9] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [18, 33924.4] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [28, 36534.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [26, 36580.9] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [5, 218.271] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [22, 404.231] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [5, 657.346] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [38, 983.424] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [43, 1377.14] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [44, 1729.85] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [44, 1904.52] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [5, 496.172] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [9, 943.954] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [9, 1554.6] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [41, 2252.18] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [44, 3023.83] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [41, 3653.97] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [49, 3913.43] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [9, 1050.15] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [44, 1875.25] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [47, 3101.14] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [44, 4620.97] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [49, 6264.82] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [41, 7468.99] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [41, 7863.55] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [49, 2025.57] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [22, 3645.09] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [41, 6053.11] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [39, 9035.39] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [41, 12173.6] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [40, 14626.0] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [41, 15811.7] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [21, 3646.69] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [33, 6572.41] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [21, 10867.2] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [33, 16249.1] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [8, 21993.3] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [21, 26658.7] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [33, 29005.9] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [12, 6183.99] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [28, 10738.6] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [2, 18176.8] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [1, 25557.1] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [4, 30740.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [24, 32902.6] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [35, 35810.1] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [28, 10343.5] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [31, 16763.2] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [27, 23679.9] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [31, 30242.8] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [28, 34131.8] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [2, 37668.5] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [28, 37185.0] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [2, 456.432] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [44, 815.695] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [22, 1358.85] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [41, 2109.63] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [49, 2877.73] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [44, 3511.47] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [41, 3828.74] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [7, 1033.08] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [7, 1967.92] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [41, 3228.87] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [41, 4624.37] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [47, 6076.87] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [44, 7326.56] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [49, 7814.26] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [22, 2032.12] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [49, 3641.94] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [49, 6055.28] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [41, 9016.78] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [41, 12240.2] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [41, 14694.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [41, 15627.0] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [21, 3648.28] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [8, 6605.18] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [6, 10909.6] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [8, 16386.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [33, 21931.0] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [8, 26898.2] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [33, 29091.3] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [3, 6200.01] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [5, 10852.0] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [2, 18448.3] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [5, 25720.4] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [31, 30377.5] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [10, 32716.7] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [11, 35693.8] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [14, 10831.0] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [14, 17308.0] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [18, 24651.2] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [2, 31447.4] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [31, 34654.7] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [18, 38424.7] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [28, 40266.6] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [28, 13611.4] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [18, 20825.1] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [5, 28104.5] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [28, 33066.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [2, 37666.3] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [18, 40570.6] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [14, 40231.5] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [2, 966.578] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [0, 1768.5] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [20, 2846.17] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [44, 4206.92] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [49, 5748.9] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [49, 7017.79] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [49, 7271.78] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [7, 1920.18] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [9, 3648.81] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [49, 6017.65] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [49, 9205.61] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [44, 12195.0] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [49, 14671.3] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [44, 13850.9] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [21, 3677.6] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [21, 6618.23] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [33, 11015.9] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [8, 16499.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [9, 22029.4] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [8, 26694.9] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [9, 28943.4] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [24, 5967.0] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [5, 10440.1] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [5, 17975.6] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [1, 25368.8] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [30, 30814.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [10, 32993.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [26, 35643.3] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [30, 10536.2] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [30, 16841.8] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [28, 24297.2] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [18, 31188.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [28, 34977.6] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [16, 38439.9] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 39310.3] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [31, 13682.7] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [28, 20853.1] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [14, 28171.3] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [14, 33011.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [31, 37838.4] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [14, 40624.1] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [13, 40569.6] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [31, 16107.4] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [28, 23625.0] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [14, 29528.1] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [13, 35602.3] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [28, 39645.1] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 41199.1] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 40470.4] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [19, 1581.17] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [21, 3002.72] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [21, 5094.8] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [9, 7754.66] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [33, 10734.0] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 13216.0] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [21, 14478.2] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [8, 4118.79] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [21, 6933.67] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [8, 11427.3] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [21, 17050.0] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [33, 23033.8] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 27546.0] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [8, 29441.2] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [22, 7468.71] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [31, 12682.8] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [5, 19388.2] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [5, 25789.6] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [30, 30932.9] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [37, 32913.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [10, 36545.9] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [30, 10490.1] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [14, 16913.9] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [28, 24281.1] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [18, 31168.7] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [14, 34598.1] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [16, 38508.8] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 37806.2] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [28, 13658.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [31, 20927.9] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [31, 28140.8] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [14, 32921.4] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [31, 37670.7] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [13, 40662.0] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [28, 39736.3] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [31, 16083.6] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [18, 23636.1] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [2, 29682.1] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [17, 35629.9] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [14, 39631.9] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [14, 41348.8] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [31, 40294.3] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [31, 16959.2] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [31, 24553.1] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [5, 31740.6] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [17, 37183.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [31, 39984.0] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [14, 41333.5] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [23, 38372.4] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [21, 2211.02] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [32, 4254.94] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [19, 7136.96] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [21, 10914.7] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [21, 14685.5] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 17868.9] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [9, 18919.9] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [19, 5265.92] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [21, 9489.38] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [33, 15606.7] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [33, 23017.9] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [22, 30352.3] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 36329.1] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 37923.3] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [33, 8889.77] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [18, 14731.4] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [28, 21226.9] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [14, 28508.4] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [28, 34183.6] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [29, 36300.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [11, 37577.9] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [36, 11821.2] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [31, 18642.6] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [31, 25960.9] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [18, 32563.3] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [31, 35803.3] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [31, 39464.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 39013.4] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [30, 14741.9] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [28, 22210.4] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [14, 29142.9] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [18, 34197.8] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [18, 38624.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [18, 41330.2] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [30, 40005.7] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [14, 16793.0] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [14, 23383.9] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [2, 30709.5] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [13, 36397.4] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [28, 40230.9] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [13, 41088.7] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [28, 40108.3] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [34, 17263.3] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [14, 24821.2] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [5, 31872.9] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [17, 37287.4] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [13, 39593.2] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [15, 41369.1] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 36883.9] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_BBS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_BBS_BH_GB.yaml new file mode 100644 index 00000000000..8e220f01498 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_BBS_BH_GB.yaml @@ -0,0 +1,14343 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 31.5455] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [18, 58.1573] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [2, 92.6138] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [41, 148.408] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [49, 214.872] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [41, 275.406] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [41, 308.15] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [5, 55.8823] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [31, 103.635] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [22, 179.998] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [41, 290.042] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [41, 422.579] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [41, 545.885] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [41, 615.982] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [5, 128.96] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [21, 235.053] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [22, 405.128] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [41, 646.222] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [44, 910.517] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [49, 1146.1] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [44, 1274.98] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [31, 288.268] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [34, 526.46] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [44, 896.506] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [41, 1400.79] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [39, 1890.07] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [44, 2351.15] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [44, 2575.12] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [34, 571.429] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [41, 1047.92] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [39, 1793.01] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [41, 2791.32] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [42, 3852.4] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [39, 4840.68] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [41, 5274.3] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [9, 1141.0] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [41, 2085.94] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [44, 3573.04] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [41, 5542.98] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [44, 7771.73] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [49, 9778.7] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [46, 10868.9] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [34, 2055.78] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [33, 3798.32] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [33, 6475.81] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [33, 10103.7] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [7, 14055.7] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [33, 17325.1] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [34, 18171.0] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [5, 63.5655] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [5, 118.698] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [2, 188.677] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [49, 297.025] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [43, 425.537] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [49, 549.424] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [41, 614.55] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [9, 138.958] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [9, 263.793] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [9, 450.129] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [49, 686.129] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [41, 945.993] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [48, 1168.41] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [49, 1286.4] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [22, 340.501] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [9, 610.791] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [9, 1016.19] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [41, 1489.98] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [49, 2000.02] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [43, 2422.62] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [49, 2606.32] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [28, 726.035] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [9, 1289.36] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [44, 2142.41] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [44, 3157.77] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [41, 4194.3] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [39, 5001.78] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [49, 5274.29] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [44, 1468.59] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [42, 2612.86] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [41, 4346.43] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [47, 6454.02] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [41, 8498.02] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [49, 10038.3] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [40, 10967.3] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [44, 2697.74] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [41, 4830.74] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [44, 8108.85] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [47, 12237.2] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [41, 16675.1] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [41, 20214.3] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [38, 19144.9] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [45, 4424.36] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [32, 8058.22] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [21, 13687.3] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [33, 21218.5] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [9, 29324.4] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [33, 36226.1] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [22, 34507.1] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [14, 141.604] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [5, 263.394] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [1, 421.537] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [41, 643.445] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [41, 904.968] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [44, 1138.48] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [41, 1262.65] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [9, 322.589] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [22, 613.023] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [41, 1013.61] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [8, 1491.04] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [48, 1980.55] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [44, 2414.42] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [49, 2587.13] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [28, 687.929] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [9, 1236.71] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [41, 2041.52] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [44, 3068.25] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [49, 4176.55] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [49, 4988.02] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [44, 5248.0] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [22, 1408.43] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [41, 2521.37] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [44, 4170.32] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [49, 6216.09] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [44, 8264.64] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [49, 9917.08] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [49, 10609.7] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [49, 2777.22] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [49, 4954.88] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [49, 8125.54] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [44, 12225.0] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [41, 16373.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [49, 19692.3] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [48, 21657.6] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [33, 4637.79] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [21, 8376.04] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [21, 13583.4] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [21, 20875.3] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [9, 28780.5] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [33, 35400.8] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [21, 38470.2] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [49, 7849.91] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [3, 14079.3] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [23, 20854.2] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [27, 27932.9] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [18, 33924.4] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [28, 36534.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [26, 36580.9] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [5, 218.271] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [22, 404.231] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [5, 657.346] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [38, 983.424] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [43, 1377.14] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [44, 1729.85] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [44, 1904.52] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [5, 496.172] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [9, 943.954] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [9, 1554.6] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [41, 2252.18] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [44, 3023.83] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [41, 3653.97] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [49, 3913.43] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [9, 1050.15] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [44, 1875.25] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [47, 3101.14] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [44, 4620.97] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [49, 6264.82] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [41, 7468.99] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [41, 7863.55] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [49, 2025.57] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [22, 3645.09] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [41, 6053.11] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [39, 9035.39] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [41, 12173.6] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [40, 14626.0] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [41, 15811.7] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [21, 3646.69] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [33, 6572.41] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [21, 10867.2] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [33, 16249.1] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [8, 21993.3] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [21, 26658.7] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [33, 29005.9] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [12, 6183.99] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [28, 10738.6] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [2, 18176.8] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [1, 25557.1] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [4, 30740.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [24, 32902.6] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [35, 35810.1] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [28, 10343.5] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [31, 16763.2] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [27, 23679.9] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [31, 30242.8] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [28, 34131.8] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [2, 37668.5] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [28, 37185.0] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [2, 456.432] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [44, 815.695] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [22, 1358.85] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [41, 2109.63] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [49, 2877.73] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [44, 3511.47] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [41, 3828.74] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [7, 1033.08] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [7, 1967.92] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [41, 3228.87] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [41, 4624.37] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [47, 6076.87] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [44, 7326.56] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [49, 7814.26] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [22, 2032.12] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [49, 3641.94] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [49, 6055.28] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [41, 9016.78] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [41, 12240.2] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [41, 14694.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [41, 15627.0] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [21, 3648.28] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [8, 6605.18] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [6, 10909.6] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [8, 16386.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [33, 21931.0] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [8, 26898.2] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [33, 29091.3] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [3, 6200.01] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [5, 10852.0] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [2, 18448.3] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [5, 25720.4] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [31, 30377.5] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [10, 32716.7] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [11, 35693.8] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [14, 10831.0] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [14, 17308.0] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [18, 24651.2] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [2, 31447.4] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [31, 34654.7] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [18, 38424.7] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [28, 40266.6] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [28, 13611.4] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [18, 20825.1] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [5, 28104.5] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [28, 33066.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [2, 37666.3] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [18, 40570.6] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [14, 40231.5] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [2, 966.578] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [0, 1768.5] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [20, 2846.17] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [44, 4206.92] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [49, 5748.9] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [49, 7017.79] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [49, 7271.78] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [7, 1920.18] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [9, 3648.81] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [49, 6017.65] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [49, 9205.61] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [44, 12195.0] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [49, 14671.3] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [44, 13850.9] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [21, 3677.6] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [21, 6618.23] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [33, 11015.9] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [8, 16499.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [9, 22029.4] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [8, 26694.9] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [9, 28943.4] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [24, 5967.0] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [5, 10440.1] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [5, 17975.6] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [1, 25368.8] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [30, 30814.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [10, 32993.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [26, 35643.3] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [30, 10536.2] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [30, 16841.8] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [28, 24297.2] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [18, 31188.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [28, 34977.6] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [16, 38439.9] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 39310.3] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [31, 13682.7] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [28, 20853.1] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [14, 28171.3] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [14, 33011.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [31, 37838.4] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [14, 40624.1] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [13, 40569.6] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [31, 16107.4] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [28, 23625.0] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [14, 29528.1] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [13, 35602.3] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [28, 39645.1] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 41199.1] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 40470.4] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [19, 1581.17] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [21, 3002.72] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [21, 5094.8] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [9, 7754.66] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [33, 10734.0] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 13216.0] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [21, 14478.2] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [8, 4118.79] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [21, 6933.67] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [8, 11427.3] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [21, 17050.0] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [33, 23033.8] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [8, 27546.0] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [8, 29441.2] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [22, 7468.71] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [31, 12682.8] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [5, 19388.2] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [5, 25789.6] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [30, 30932.9] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [37, 32913.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [10, 36545.9] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [30, 10490.1] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [14, 16913.9] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [28, 24281.1] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [18, 31168.7] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [14, 34598.1] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [16, 38508.8] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 37806.2] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [28, 13658.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [31, 20927.9] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [31, 28140.8] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [14, 32921.4] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [31, 37670.7] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [13, 40662.0] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [28, 39736.3] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [31, 16083.6] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [18, 23636.1] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [2, 29682.1] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [17, 35629.9] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [14, 39631.9] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [14, 41348.8] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [31, 40294.3] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [31, 16959.2] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [31, 24553.1] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [5, 31740.6] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [17, 37183.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [31, 39984.0] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [14, 41333.5] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [23, 38372.4] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [21, 2211.02] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [32, 4254.94] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [19, 7136.96] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [21, 10914.7] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [21, 14685.5] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 17868.9] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [9, 18919.9] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [19, 5265.92] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [21, 9489.38] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [33, 15606.7] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [33, 23017.9] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [22, 30352.3] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [8, 36329.1] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 37923.3] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [33, 8889.77] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [18, 14731.4] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [28, 21226.9] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [14, 28508.4] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [28, 34183.6] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [29, 36300.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [11, 37577.9] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [36, 11821.2] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [31, 18642.6] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [31, 25960.9] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [18, 32563.3] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [31, 35803.3] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [31, 39464.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 39013.4] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [30, 14741.9] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [28, 22210.4] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [14, 29142.9] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [18, 34197.8] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [18, 38624.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [18, 41330.2] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [30, 40005.7] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [14, 16793.0] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [14, 23383.9] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [2, 30709.5] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [13, 36397.4] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [28, 40230.9] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [13, 41088.7] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [28, 40108.3] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [34, 17263.3] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [14, 24821.2] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [5, 31872.9] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [17, 37287.4] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [13, 39593.2] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [15, 41369.1] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [17, 36883.9] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HB.yaml new file mode 100644 index 00000000000..0e856b66990 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HB.yaml @@ -0,0 +1,22713 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 36.787] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [29, 61.6809] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [71, 105.778] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [71, 165.352] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [71, 228.523] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [77, 285.911] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [71, 316.438] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [35, 64.9677] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [29, 119.048] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [29, 204.52] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [77, 324.085] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [71, 455.878] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [71, 574.032] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [80, 633.976] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [29, 154.361] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [29, 280.48] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [77, 483.549] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [71, 727.546] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [71, 992.327] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [61, 1191.18] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [77, 1301.8] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [71, 331.568] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [77, 597.056] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [74, 1002.7] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [71, 1479.99] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [77, 2015.04] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [77, 2450.3] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [74, 2626.16] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [48, 682.001] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [77, 1233.08] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [80, 2064.63] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [79, 3019.93] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [64, 4102.27] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [80, 5002.53] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [62, 5363.78] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [70, 1281.5] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [66, 2328.88] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [76, 3924.04] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [60, 6173.2] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [60, 8336.5] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [79, 10165.3] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [67, 11151.6] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [11, 2210.15] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [72, 3995.05] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [58, 6519.22] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [13, 9791.93] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [15, 13509.6] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [12, 16861.5] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [68, 14846.1] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [6, 73.7603] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [2, 128.691] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [8, 204.421] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [77, 322.341] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [80, 451.364] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [74, 575.667] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [71, 630.285] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [34, 167.772] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [19, 306.154] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [49, 506.314] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [71, 781.355] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [71, 1036.01] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [80, 1222.96] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [71, 1305.11] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [44, 386.287] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [10, 679.13] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [49, 1111.22] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [63, 1634.73] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [67, 2098.2] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [67, 2503.6] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [77, 2658.73] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [67, 798.767] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [10, 1399.97] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [77, 2307.73] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [80, 3356.46] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [63, 4361.97] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [74, 5071.52] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [62, 5336.48] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [71, 1512.29] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [77, 2705.57] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [77, 4459.08] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [69, 6578.67] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [65, 8665.37] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [65, 10514.5] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [80, 11038.3] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [63, 2750.36] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [74, 4932.32] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [77, 8253.46] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [73, 12729.3] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [65, 17271.6] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [67, 20606.1] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [64, 18206.4] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [78, 4783.92] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [68, 8345.84] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [80, 13684.5] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [53, 20285.4] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [24, 27543.2] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [32, 34444.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [68, 29748.9] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [44, 163.457] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [14, 283.9] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [77, 455.46] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [77, 721.789] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [76, 977.866] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [71, 1197.13] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [77, 1282.91] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [45, 377.253] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [10, 682.006] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [77, 1113.73] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [71, 1630.44] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [63, 2126.53] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [67, 2480.46] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [74, 2627.34] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [49, 802.891] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [29, 1400.9] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [71, 2296.68] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [71, 3342.76] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [77, 4234.53] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [77, 5028.58] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [71, 5335.53] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [71, 1605.78] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [63, 2827.3] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [63, 4428.47] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [77, 6535.73] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [77, 8538.59] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [74, 10038.7] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [63, 10651.8] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [77, 2963.65] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [80, 5261.79] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [71, 8682.74] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [61, 12774.2] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [77, 16608.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [65, 20356.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [66, 21677.3] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [30, 4790.75] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [45, 8530.43] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [10, 14281.5] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [28, 21160.0] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [22, 28420.9] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [8, 35298.5] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [77, 31999.0] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [71, 9346.64] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [24, 15319.9] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [45, 24103.1] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [45, 30483.3] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [24, 36005.1] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [44, 38255.1] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [41, 37032.3] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [44, 250.457] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [29, 416.267] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [71, 701.702] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [76, 1074.55] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [70, 1463.55] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [70, 1812.19] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [71, 1934.61] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [39, 548.516] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [77, 1038.19] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [77, 1701.77] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [71, 2468.45] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [63, 3202.58] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [62, 3768.05] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [70, 3949.37] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [63, 1146.19] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [29, 2032.45] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [63, 3489.93] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [71, 5045.79] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [77, 6521.33] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [77, 7541.18] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [76, 8141.82] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [77, 2310.49] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [74, 4098.02] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [61, 6701.08] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [29, 9589.72] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [63, 12523.4] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [63, 15113.5] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [74, 15963.1] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [17, 3942.63] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [6, 6739.64] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [2, 11123.0] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [5, 16711.8] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [36, 21789.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [6, 25744.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [55, 28410.4] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [59, 7146.34] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [33, 12191.3] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [45, 18925.3] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [59, 26637.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [45, 31853.0] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [51, 33147.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [50, 36142.8] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [15, 12686.0] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [32, 20040.5] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [6, 27488.7] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [44, 33494.5] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [0, 36072.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [45, 38920.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [53, 37168.5] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [4, 519.187] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [1, 905.115] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [62, 1460.59] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [71, 2215.5] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [71, 2979.09] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [71, 3611.11] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [77, 3905.31] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [57, 1119.08] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [10, 2030.48] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [77, 3460.17] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [77, 5031.16] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [63, 6476.03] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [77, 7572.38] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [64, 7588.36] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [77, 2205.98] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [71, 3886.63] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [77, 6449.47] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [71, 9565.14] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [77, 12743.8] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [77, 15041.2] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [67, 15194.5] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [14, 3786.61] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [71, 6866.55] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [0, 11513.6] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [4, 16846.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [3, 21862.0] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [0, 25894.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [36, 27419.8] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [34, 6767.74] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [5, 11532.1] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [25, 18934.1] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [25, 25710.6] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [45, 30780.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [42, 33085.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [41, 35682.8] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [22, 13259.1] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [23, 20695.6] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [44, 27503.7] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [25, 34032.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [25, 36592.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [2, 39840.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [25, 40606.7] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [44, 18355.8] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [44, 26355.1] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [6, 33211.3] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [2, 36022.8] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [6, 39866.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [23, 42018.9] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [22, 40810.0] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [5, 1044.92] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [9, 1758.37] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [48, 2922.18] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [70, 4416.61] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [76, 5931.5] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 7239.89] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [64, 7182.67] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [10, 2125.85] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [39, 3885.41] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [49, 6359.84] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [63, 9418.37] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [77, 12395.5] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 15041.8] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [60, 15514.4] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [53, 3786.64] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [45, 6720.74] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [0, 10987.0] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [6, 16204.7] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [2, 21645.7] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [6, 26488.6] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 24964.2] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [34, 6832.04] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [59, 11611.9] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [13, 19552.0] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [16, 26478.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [6, 31531.2] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [6, 33267.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [51, 36010.3] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [52, 13257.4] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [54, 20778.9] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [45, 28500.4] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [44, 34005.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [25, 36553.3] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [23, 40005.3] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [25, 40401.7] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [45, 17843.4] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [25, 25753.4] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [23, 32806.7] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [23, 36010.7] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [23, 39857.8] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 41968.8] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [6, 41102.7] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [50, 22328.7] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [54, 30248.6] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [23, 34817.3] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [44, 39279.0] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [25, 41810.0] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [44, 42416.8] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [25, 41505.7] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [40, 2025.58] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [7, 3677.08] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [46, 5695.56] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [46, 8489.8] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [7, 11028.6] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [7, 13281.9] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [37, 13892.0] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [25, 4517.31] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [75, 7468.71] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [27, 12489.2] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [47, 18088.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 23391.1] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [27, 27574.3] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [56, 24550.9] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [23, 8080.21] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [59, 14051.3] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [14, 20655.2] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [23, 27078.2] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [44, 31407.0] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [51, 33095.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [20, 36830.0] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [34, 12782.6] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [45, 20817.6] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [45, 28371.8] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [25, 34249.4] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [44, 36287.4] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [44, 39622.0] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [0, 38846.3] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [51, 18282.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [44, 26310.4] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [25, 33269.0] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [45, 35983.3] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [23, 39745.2] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [25, 41865.1] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 40445.5] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [51, 22383.3] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [25, 30054.4] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [23, 34728.7] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [45, 39256.0] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [44, 41769.3] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [45, 42628.2] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 41025.3] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [50, 25967.6] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [44, 31698.8] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [25, 37702.1] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [23, 40959.6] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [54, 42079.5] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [44, 42717.8] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [21, 40209.8] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [9, 2530.88] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [38, 4329.6] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [48, 7176.65] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [26, 10639.5] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [57, 14002.9] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 17752.5] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 17227.3] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [44, 6028.46] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [6, 10596.7] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [17, 17060.9] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [6, 24385.5] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [27, 30782.2] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 35920.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [48, 33528.8] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [45, 9961.24] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [30, 16153.3] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [44, 23540.8] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [53, 30528.4] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [45, 36224.9] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 37845.9] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [20, 38296.0] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [34, 14835.6] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [44, 22448.3] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [23, 29799.7] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [53, 35931.6] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [23, 37623.1] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 40661.7] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [59, 39759.2] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [50, 20171.0] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [45, 28238.6] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [44, 34688.3] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [25, 37404.8] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [23, 40765.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [45, 42570.8] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [43, 41104.1] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [31, 23998.5] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [34, 31414.4] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [25, 36192.6] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [45, 39979.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [44, 42379.5] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 42863.8] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 41070.2] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [45, 23824.4] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [53, 30552.7] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [14, 35533.5] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [25, 40993.9] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [25, 42468.8] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 42915.8] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [30, 37918.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HB_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HB_GB.yaml new file mode 100644 index 00000000000..fad24d41049 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HB_GB.yaml @@ -0,0 +1,22713 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 36.787] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [29, 61.6809] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [71, 105.778] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [71, 165.352] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [71, 228.523] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [77, 285.911] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [71, 316.438] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [35, 64.9677] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [29, 119.048] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [29, 204.52] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [77, 324.085] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [71, 455.878] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [71, 574.032] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [80, 633.976] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [29, 154.361] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [29, 280.48] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [77, 483.549] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [71, 727.546] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [71, 992.327] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [61, 1191.18] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [77, 1301.8] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [71, 331.568] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [77, 597.056] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [74, 1002.7] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [71, 1479.99] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [77, 2015.04] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [77, 2450.3] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [74, 2626.16] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [48, 682.001] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [77, 1233.08] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [80, 2064.63] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [79, 3019.93] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [64, 4102.27] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [80, 5002.53] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [62, 5363.78] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [70, 1281.5] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [66, 2328.88] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [76, 3924.04] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [60, 6173.2] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [60, 8336.5] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [79, 10165.3] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [67, 11151.6] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [11, 2210.15] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [72, 3995.05] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [58, 6519.22] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [13, 9791.93] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [15, 13509.6] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [12, 16861.5] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [68, 14846.1] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [6, 73.7603] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [2, 128.691] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [8, 204.421] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [77, 322.341] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [80, 451.364] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [74, 575.667] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [71, 630.285] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [34, 167.772] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [19, 306.154] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [49, 506.314] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [71, 781.355] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [71, 1036.01] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [80, 1222.96] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [71, 1305.11] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [44, 386.287] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [10, 679.13] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [49, 1111.22] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [63, 1634.73] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [67, 2098.2] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [67, 2503.6] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [77, 2658.73] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [67, 798.767] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [10, 1399.97] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [77, 2307.73] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [80, 3356.46] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [63, 4361.97] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [74, 5071.52] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [62, 5336.48] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [71, 1512.29] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [77, 2705.57] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [77, 4459.08] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [69, 6578.67] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [65, 8665.37] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [65, 10514.5] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [80, 11038.3] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [63, 2750.36] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [74, 4932.32] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [77, 8253.46] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [73, 12729.3] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [65, 17271.6] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [67, 20606.1] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [64, 18206.4] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [78, 4783.92] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [68, 8345.84] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [80, 13684.5] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [53, 20285.4] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [24, 27543.2] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [32, 34444.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [68, 29748.9] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [44, 163.457] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [14, 283.9] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [77, 455.46] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [77, 721.789] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [76, 977.866] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [71, 1197.13] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [77, 1282.91] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [45, 377.253] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [10, 682.006] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [77, 1113.73] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [71, 1630.44] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [63, 2126.53] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [67, 2480.46] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [74, 2627.34] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [49, 802.891] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [29, 1400.9] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [71, 2296.68] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [71, 3342.76] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [77, 4234.53] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [77, 5028.58] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [71, 5335.53] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [71, 1605.78] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [63, 2827.3] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [63, 4428.47] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [77, 6535.73] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [77, 8538.59] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [74, 10038.7] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [63, 10651.8] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [77, 2963.65] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [80, 5261.79] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [71, 8682.74] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [61, 12774.2] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [77, 16608.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [65, 20356.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [66, 21677.3] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [30, 4790.75] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [45, 8530.43] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [10, 14281.5] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [28, 21160.0] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [22, 28420.9] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [8, 35298.5] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [77, 31999.0] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [71, 9346.64] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [24, 15319.9] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [45, 24103.1] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [45, 30483.3] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [24, 36005.1] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [44, 38255.1] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [41, 37032.3] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [44, 250.457] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [29, 416.267] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [71, 701.702] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [76, 1074.55] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [70, 1463.55] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [70, 1812.19] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [71, 1934.61] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [39, 548.516] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [77, 1038.19] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [77, 1701.77] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [71, 2468.45] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [63, 3202.58] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [62, 3768.05] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [70, 3949.37] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [63, 1146.19] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [29, 2032.45] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [63, 3489.93] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [71, 5045.79] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [77, 6521.33] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [77, 7541.18] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [76, 8141.82] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [77, 2310.49] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [74, 4098.02] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [61, 6701.08] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [29, 9589.72] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [63, 12523.4] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [63, 15113.5] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [74, 15963.1] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [17, 3942.63] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [6, 6739.64] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [2, 11123.0] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [5, 16711.8] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [36, 21789.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [6, 25744.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [55, 28410.4] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [59, 7146.34] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [33, 12191.3] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [45, 18925.3] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [59, 26637.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [45, 31853.0] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [51, 33147.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [50, 36142.8] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [15, 12686.0] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [32, 20040.5] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [6, 27488.7] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [44, 33494.5] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [0, 36072.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [45, 38920.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [53, 37168.5] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [4, 519.187] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [1, 905.115] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [62, 1460.59] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [71, 2215.5] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [71, 2979.09] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [71, 3611.11] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [77, 3905.31] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [57, 1119.08] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [10, 2030.48] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [77, 3460.17] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [77, 5031.16] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [63, 6476.03] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [77, 7572.38] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [64, 7588.36] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [77, 2205.98] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [71, 3886.63] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [77, 6449.47] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [71, 9565.14] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [77, 12743.8] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [77, 15041.2] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [67, 15194.5] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [14, 3786.61] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [71, 6866.55] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [0, 11513.6] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [4, 16846.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [3, 21862.0] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [0, 25894.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [36, 27419.8] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [34, 6767.74] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [5, 11532.1] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [25, 18934.1] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [25, 25710.6] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [45, 30780.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [42, 33085.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [41, 35682.8] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [22, 13259.1] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [23, 20695.6] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [44, 27503.7] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [25, 34032.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [25, 36592.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [2, 39840.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [25, 40606.7] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [44, 18355.8] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [44, 26355.1] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [6, 33211.3] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [2, 36022.8] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [6, 39866.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [23, 42018.9] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [22, 40810.0] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [5, 1044.92] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [9, 1758.37] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [48, 2922.18] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [70, 4416.61] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [76, 5931.5] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 7239.89] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [64, 7182.67] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [10, 2125.85] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [39, 3885.41] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [49, 6359.84] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [63, 9418.37] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [77, 12395.5] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 15041.8] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [60, 15514.4] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [53, 3786.64] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [45, 6720.74] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [0, 10987.0] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [6, 16204.7] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [2, 21645.7] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [6, 26488.6] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 24964.2] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [34, 6832.04] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [59, 11611.9] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [13, 19552.0] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [16, 26478.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [6, 31531.2] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [6, 33267.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [51, 36010.3] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [52, 13257.4] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [54, 20778.9] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [45, 28500.4] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [44, 34005.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [25, 36553.3] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [23, 40005.3] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [25, 40401.7] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [45, 17843.4] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [25, 25753.4] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [23, 32806.7] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [23, 36010.7] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [23, 39857.8] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 41968.8] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [6, 41102.7] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [50, 22328.7] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [54, 30248.6] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [23, 34817.3] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [44, 39279.0] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [25, 41810.0] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [44, 42416.8] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [25, 41505.7] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [40, 2025.58] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [7, 3677.08] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [46, 5695.56] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [46, 8489.8] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [7, 11028.6] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [7, 13281.9] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [37, 13892.0] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [25, 4517.31] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [75, 7468.71] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [27, 12489.2] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [47, 18088.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [8, 23391.1] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [27, 27574.3] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [56, 24550.9] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [23, 8080.21] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [59, 14051.3] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [14, 20655.2] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [23, 27078.2] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [44, 31407.0] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [51, 33095.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [20, 36830.0] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [34, 12782.6] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [45, 20817.6] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [45, 28371.8] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [25, 34249.4] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [44, 36287.4] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [44, 39622.0] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [0, 38846.3] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [51, 18282.5] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [44, 26310.4] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [25, 33269.0] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [45, 35983.3] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [23, 39745.2] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [25, 41865.1] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 40445.5] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [51, 22383.3] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [25, 30054.4] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [23, 34728.7] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [45, 39256.0] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [44, 41769.3] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [45, 42628.2] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [24, 41025.3] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [50, 25967.6] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [44, 31698.8] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [25, 37702.1] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [23, 40959.6] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [54, 42079.5] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [44, 42717.8] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [21, 40209.8] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [9, 2530.88] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [38, 4329.6] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [48, 7176.65] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [26, 10639.5] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [57, 14002.9] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 17752.5] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 17227.3] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [44, 6028.46] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [6, 10596.7] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [17, 17060.9] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [6, 24385.5] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [27, 30782.2] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [7, 35920.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [48, 33528.8] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [45, 9961.24] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [30, 16153.3] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [44, 23540.8] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [53, 30528.4] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [45, 36224.9] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 37845.9] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [20, 38296.0] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [34, 14835.6] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [44, 22448.3] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [23, 29799.7] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [53, 35931.6] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [23, 37623.1] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 40661.7] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [59, 39759.2] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [50, 20171.0] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [45, 28238.6] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [44, 34688.3] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [25, 37404.8] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [23, 40765.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [45, 42570.8] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [43, 41104.1] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [31, 23998.5] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [34, 31414.4] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [25, 36192.6] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [45, 39979.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [44, 42379.5] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 42863.8] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 41070.2] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [45, 23824.4] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [53, 30552.7] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [14, 35533.5] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [25, 40993.9] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [25, 42468.8] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [25, 42915.8] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [30, 37918.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HHS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HHS_BH.yaml new file mode 100644 index 00000000000..e7ab80da627 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HHS_BH.yaml @@ -0,0 +1,27843 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 35.3438] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [10, 59.4905] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [80, 99.0533] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [83, 158.228] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [67, 223.232] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [80, 280.387] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [75, 312.98] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [23, 65.4054] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [23, 114.737] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [83, 198.051] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [67, 314.417] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [80, 444.948] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [68, 561.694] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [79, 624.758] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [19, 148.776] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [87, 266.036] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [83, 453.832] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [80, 677.703] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [87, 942.385] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [79, 1182.04] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [79, 1286.9] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [34, 298.019] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [68, 543.307] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [83, 954.23] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [5, 1422.41] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [79, 1943.39] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [90, 2382.8] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [84, 2578.09] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [83, 640.841] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [78, 1160.25] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [66, 1944.06] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [86, 2955.31] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [76, 3905.1] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [75, 4816.38] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [71, 5189.58] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [83, 1203.02] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [77, 2285.42] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [83, 3851.97] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [79, 5670.86] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [85, 7965.46] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [79, 9678.61] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [89, 10660.7] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [6, 2186.53] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [6, 3834.38] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [6, 6552.34] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [16, 10422.3] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [33, 14322.0] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [27, 17879.7] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [14, 18480.7] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [1, 73.5639] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [11, 118.873] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [87, 204.143] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [9, 317.438] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [68, 445.517] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [73, 559.988] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [75, 622.903] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [42, 170.751] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [34, 296.46] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [27, 511.19] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [68, 761.289] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [27, 999.126] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [79, 1217.24] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [75, 1299.3] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [10, 375.363] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [27, 662.924] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [66, 1085.77] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [7, 1584.11] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [79, 2060.71] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [76, 2473.52] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [67, 2668.41] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [8, 778.312] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [68, 1381.52] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [68, 2269.96] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [7, 3219.89] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [74, 4286.2] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [80, 4974.72] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [85, 5294.81] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [22, 1567.38] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [68, 2786.46] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [68, 4570.23] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [76, 6603.95] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [76, 8363.02] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [71, 10050.8] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [70, 10578.1] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [27, 2719.17] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [87, 5084.79] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [80, 8438.18] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [79, 12438.0] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [90, 16673.1] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [76, 20115.1] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [90, 21675.2] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [97, 4774.41] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [6, 8442.51] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [17, 13930.3] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [27, 21452.6] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [21, 30345.4] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [27, 36544.4] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [27, 34812.1] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [10, 162.848] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [78, 267.019] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [78, 453.195] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [78, 695.518] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [83, 937.436] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [68, 1166.63] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [76, 1265.4] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [56, 367.214] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [78, 666.506] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [27, 1088.16] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [83, 1545.01] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [85, 2095.06] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [88, 2436.52] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [85, 2614.71] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [27, 781.794] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [87, 1389.08] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [7, 2230.43] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [83, 3285.14] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [85, 4186.2] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [79, 5019.76] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [79, 5269.76] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [99, 1557.49] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [68, 2791.09] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [78, 4560.9] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [7, 6477.72] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [85, 8405.43] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [82, 9929.21] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [90, 10515.4] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [68, 2916.77] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [83, 5190.17] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [7, 8400.16] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [77, 12208.3] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [71, 16455.4] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [87, 19471.7] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [85, 21216.1] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [56, 4964.43] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [17, 8974.2] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [27, 14845.5] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [17, 22319.5] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [17, 29610.7] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [17, 36750.7] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [26, 39371.0] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [92, 8456.26] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [18, 14592.1] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [36, 23638.3] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [58, 31683.3] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [47, 36785.9] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [62, 39229.3] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [65, 38514.8] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [11, 238.639] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [27, 412.285] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [83, 674.691] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [78, 1060.24] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [85, 1426.07] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [79, 1773.06] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [71, 1903.95] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [27, 560.139] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [27, 1016.89] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [72, 1655.87] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [68, 2424.46] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [87, 3153.62] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [84, 3688.8] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [82, 3913.14] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [17, 1193.15] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [5, 2066.5] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [68, 3279.37] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [83, 4937.88] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [71, 6428.48] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [84, 7479.0] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [71, 7907.75] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [21, 2154.99] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [74, 3838.01] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [78, 6580.2] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [71, 9646.74] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [69, 12321.9] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [69, 14693.3] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [81, 15659.0] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [80, 3818.22] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [5, 7041.39] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [7, 11616.0] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [6, 16951.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [7, 23068.1] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [16, 27417.6] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [7, 29648.4] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [37, 6673.54] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [96, 12415.3] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [57, 21413.3] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [53, 28504.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [59, 35455.8] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [57, 38689.8] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [50, 40022.8] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [22, 11686.1] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [30, 19168.6] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [12, 26745.6] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [13, 32553.6] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [3, 35289.7] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [2, 38532.5] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [25, 37187.9] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [1, 489.153] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [68, 827.282] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [78, 1393.92] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [68, 2180.94] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [7, 2891.14] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [85, 3566.85] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [71, 3823.73] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [27, 1092.08] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [93, 2065.83] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [68, 3411.4] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [79, 4928.21] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [85, 6244.26] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [76, 7352.01] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [76, 7789.5] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [42, 2291.98] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [27, 4029.77] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [79, 6560.43] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [7, 9468.0] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [79, 12546.1] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [76, 14596.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [84, 15715.5] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [7, 3751.05] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [32, 6741.47] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [17, 11218.5] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [27, 17317.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [27, 22698.8] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [27, 27706.3] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [27, 29548.3] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [29, 7174.89] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [95, 12006.6] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [96, 19163.1] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [39, 29144.0] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [55, 35034.8] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [35, 37956.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [38, 40750.9] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [25, 12988.8] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [11, 19550.2] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [65, 27524.4] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [2, 33781.1] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [37, 36378.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [56, 40192.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [58, 41995.9] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [60, 17393.3] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [63, 25542.6] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [40, 32287.2] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [58, 35993.8] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [45, 40278.5] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [42, 42644.7] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [60, 42741.3] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [9, 983.809] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [27, 1747.87] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [9, 2885.34] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [85, 4350.19] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [79, 5848.11] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [70, 7060.63] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [82, 7795.5] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [27, 2216.08] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [17, 4002.85] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [68, 6505.32] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [79, 9543.38] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [7, 12204.6] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [71, 14911.4] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [82, 15818.6] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [17, 3796.32] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [33, 7098.96] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [27, 11779.0] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [27, 16995.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [27, 22810.6] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [7, 27881.7] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [6, 29707.4] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [94, 6461.89] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [98, 12026.7] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [43, 19672.3] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [43, 28391.8] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [46, 35477.6] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 37564.4] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [50, 39385.5] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [24, 12490.8] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [31, 19632.0] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [47, 27721.4] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [60, 33755.6] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [56, 36597.3] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [42, 40184.9] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [45, 42014.7] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [96, 17813.4] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [62, 25087.4] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [65, 32311.8] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [45, 36053.4] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [37, 40237.1] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [56, 42683.6] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [58, 43264.3] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [51, 21697.0] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [65, 29651.5] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [62, 34229.7] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [40, 38868.4] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [60, 41956.9] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [54, 43003.1] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [37, 43321.6] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [0, 1783.29] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [64, 3207.06] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [26, 5506.73] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [32, 8069.19] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [4, 11091.7] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [4, 13544.6] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [6, 14702.6] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [32, 4081.39] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [26, 7329.47] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [7, 12504.8] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [26, 17778.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [26, 23416.8] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [7, 28229.6] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [16, 29952.6] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [11, 8056.9] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [59, 13443.3] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [41, 21435.9] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [53, 28453.9] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [57, 35023.9] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [46, 38268.6] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [28, 37941.3] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [23, 12455.2] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [13, 19616.7] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [45, 27865.2] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [47, 33788.1] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [42, 36544.1] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [42, 40250.1] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [47, 41325.5] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [47, 17316.9] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [65, 25113.9] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [52, 32737.3] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [56, 36306.1] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [56, 40172.9] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [37, 42758.9] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [45, 43195.4] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [62, 21694.6] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [62, 29738.0] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [61, 34272.7] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [48, 38863.2] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [47, 41971.9] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [42, 43036.4] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [37, 42925.6] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [51, 25010.2] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [49, 30886.0] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [48, 36642.7] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [49, 40414.3] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [45, 42176.5] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [42, 43321.3] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [44, 41457.0] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [4, 2489.94] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [32, 4083.03] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [16, 6885.06] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [26, 10896.0] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [16, 14619.1] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [6, 18105.7] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [20, 19414.2] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [10, 5634.67] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [9, 10540.1] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [17, 17084.7] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [7, 24790.8] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [17, 31513.9] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [15, 37249.0] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 38655.1] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [48, 10338.8] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [40, 16927.4] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [60, 24593.2] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [58, 31291.6] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [60, 37382.6] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [40, 39381.7] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [34, 41215.3] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [54, 14514.7] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [61, 22022.8] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [40, 30156.2] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [42, 35698.5] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [60, 38757.6] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [40, 41716.1] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [46, 42775.1] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [61, 19273.1] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [65, 27091.4] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [61, 33915.8] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [62, 37419.7] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [54, 41089.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [60, 43202.6] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [58, 42751.4] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [61, 23254.2] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [56, 30652.9] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [62, 35241.6] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [58, 39681.5] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [58, 42476.9] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [42, 42935.5] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [61, 42847.9] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [58, 23153.0] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [60, 30122.8] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [91, 35267.4] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [47, 40597.4] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [47, 41985.1] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [40, 43431.8] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [99, 40503.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HHS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HHS_BH_GB.yaml new file mode 100644 index 00000000000..e66185c35ea --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_HHS_BH_GB.yaml @@ -0,0 +1,27843 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 1 + LSPB: 8 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 1 + LSPB: 4 + LVCA: 32 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 35.3438] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [10, 59.4905] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [80, 99.0533] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [83, 158.228] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [67, 223.232] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [80, 280.387] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [75, 312.98] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [23, 65.4054] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [23, 114.737] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [83, 198.051] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [67, 314.417] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [80, 444.948] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [68, 561.694] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [79, 624.758] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [19, 148.776] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [87, 266.036] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [83, 453.832] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [80, 677.703] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [87, 942.385] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [79, 1182.04] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [79, 1286.9] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [34, 298.019] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [68, 543.307] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [83, 954.23] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [5, 1422.41] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [79, 1943.39] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [90, 2382.8] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [84, 2578.09] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [83, 640.841] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [78, 1160.25] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [66, 1944.06] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [86, 2955.31] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [76, 3905.1] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [75, 4816.38] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [71, 5189.58] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [83, 1203.02] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [77, 2285.42] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [83, 3851.97] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [79, 5670.86] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [85, 7965.46] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [79, 9678.61] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [89, 10660.7] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [6, 2186.53] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [6, 3834.38] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [6, 6552.34] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [16, 10422.3] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [33, 14322.0] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [27, 17879.7] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [14, 18480.7] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [1, 73.5639] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [11, 118.873] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [87, 204.143] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [9, 317.438] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [68, 445.517] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [73, 559.988] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [75, 622.903] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [42, 170.751] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [34, 296.46] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [27, 511.19] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [68, 761.289] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [27, 999.126] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [79, 1217.24] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [75, 1299.3] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [10, 375.363] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [27, 662.924] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [66, 1085.77] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [7, 1584.11] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [79, 2060.71] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [76, 2473.52] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [67, 2668.41] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [8, 778.312] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [68, 1381.52] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [68, 2269.96] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [7, 3219.89] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [74, 4286.2] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [80, 4974.72] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [85, 5294.81] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [22, 1567.38] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [68, 2786.46] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [68, 4570.23] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [76, 6603.95] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [76, 8363.02] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [71, 10050.8] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [70, 10578.1] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [27, 2719.17] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [87, 5084.79] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [80, 8438.18] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [79, 12438.0] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [90, 16673.1] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [76, 20115.1] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [90, 21675.2] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [97, 4774.41] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [6, 8442.51] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [17, 13930.3] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [27, 21452.6] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [21, 30345.4] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [27, 36544.4] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [27, 34812.1] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [10, 162.848] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [78, 267.019] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [78, 453.195] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [78, 695.518] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [83, 937.436] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [68, 1166.63] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [76, 1265.4] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [56, 367.214] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [78, 666.506] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [27, 1088.16] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [83, 1545.01] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [85, 2095.06] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [88, 2436.52] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [85, 2614.71] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [27, 781.794] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [87, 1389.08] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [7, 2230.43] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [83, 3285.14] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [85, 4186.2] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [79, 5019.76] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [79, 5269.76] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [99, 1557.49] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [68, 2791.09] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [78, 4560.9] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [7, 6477.72] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [85, 8405.43] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [82, 9929.21] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [90, 10515.4] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [68, 2916.77] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [83, 5190.17] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [7, 8400.16] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [77, 12208.3] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [71, 16455.4] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [87, 19471.7] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [85, 21216.1] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [56, 4964.43] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [17, 8974.2] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [27, 14845.5] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [17, 22319.5] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [17, 29610.7] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [17, 36750.7] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [26, 39371.0] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [92, 8456.26] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [18, 14592.1] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [36, 23638.3] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [58, 31683.3] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [47, 36785.9] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [62, 39229.3] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [65, 38514.8] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [11, 238.639] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [27, 412.285] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [83, 674.691] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [78, 1060.24] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [85, 1426.07] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [79, 1773.06] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [71, 1903.95] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [27, 560.139] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [27, 1016.89] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [72, 1655.87] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [68, 2424.46] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [87, 3153.62] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [84, 3688.8] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [82, 3913.14] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [17, 1193.15] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [5, 2066.5] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [68, 3279.37] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [83, 4937.88] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [71, 6428.48] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [84, 7479.0] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [71, 7907.75] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [21, 2154.99] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [74, 3838.01] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [78, 6580.2] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [71, 9646.74] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [69, 12321.9] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [69, 14693.3] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [81, 15659.0] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [80, 3818.22] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [5, 7041.39] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [7, 11616.0] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [6, 16951.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [7, 23068.1] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [16, 27417.6] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [7, 29648.4] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [37, 6673.54] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [96, 12415.3] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [57, 21413.3] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [53, 28504.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [59, 35455.8] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [57, 38689.8] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [50, 40022.8] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [22, 11686.1] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [30, 19168.6] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [12, 26745.6] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [13, 32553.6] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [3, 35289.7] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [2, 38532.5] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [25, 37187.9] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [1, 489.153] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [68, 827.282] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [78, 1393.92] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [68, 2180.94] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [7, 2891.14] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [85, 3566.85] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [71, 3823.73] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [27, 1092.08] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [93, 2065.83] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [68, 3411.4] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [79, 4928.21] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [85, 6244.26] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [76, 7352.01] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [76, 7789.5] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [42, 2291.98] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [27, 4029.77] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [79, 6560.43] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [7, 9468.0] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [79, 12546.1] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [76, 14596.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [84, 15715.5] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [7, 3751.05] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [32, 6741.47] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [17, 11218.5] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [27, 17317.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [27, 22698.8] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [27, 27706.3] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [27, 29548.3] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [29, 7174.89] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [95, 12006.6] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [96, 19163.1] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [39, 29144.0] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [55, 35034.8] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [35, 37956.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [38, 40750.9] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [25, 12988.8] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [11, 19550.2] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [65, 27524.4] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [2, 33781.1] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [37, 36378.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [56, 40192.6] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [58, 41995.9] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [60, 17393.3] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [63, 25542.6] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [40, 32287.2] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [58, 35993.8] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [45, 40278.5] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [42, 42644.7] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [60, 42741.3] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [9, 983.809] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [27, 1747.87] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [9, 2885.34] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [85, 4350.19] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [79, 5848.11] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [70, 7060.63] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [82, 7795.5] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [27, 2216.08] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [17, 4002.85] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [68, 6505.32] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [79, 9543.38] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [7, 12204.6] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [71, 14911.4] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [82, 15818.6] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [17, 3796.32] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [33, 7098.96] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [27, 11779.0] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [27, 16995.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [27, 22810.6] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [7, 27881.7] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [6, 29707.4] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [94, 6461.89] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [98, 12026.7] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [43, 19672.3] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [43, 28391.8] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [46, 35477.6] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 37564.4] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [50, 39385.5] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [24, 12490.8] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [31, 19632.0] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [47, 27721.4] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [60, 33755.6] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [56, 36597.3] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [42, 40184.9] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [45, 42014.7] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [96, 17813.4] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [62, 25087.4] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [65, 32311.8] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [45, 36053.4] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [37, 40237.1] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [56, 42683.6] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [58, 43264.3] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [51, 21697.0] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [65, 29651.5] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [62, 34229.7] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [40, 38868.4] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [60, 41956.9] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [54, 43003.1] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [37, 43321.6] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [0, 1783.29] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [64, 3207.06] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [26, 5506.73] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [32, 8069.19] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [4, 11091.7] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [4, 13544.6] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [6, 14702.6] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [32, 4081.39] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [26, 7329.47] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [7, 12504.8] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [26, 17778.7] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [26, 23416.8] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [7, 28229.6] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [16, 29952.6] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [11, 8056.9] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [59, 13443.3] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [41, 21435.9] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [53, 28453.9] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [57, 35023.9] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [46, 38268.6] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [28, 37941.3] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [23, 12455.2] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [13, 19616.7] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [45, 27865.2] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [47, 33788.1] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [42, 36544.1] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [42, 40250.1] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [47, 41325.5] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [47, 17316.9] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [65, 25113.9] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [52, 32737.3] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [56, 36306.1] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [56, 40172.9] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [37, 42758.9] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [45, 43195.4] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [62, 21694.6] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [62, 29738.0] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [61, 34272.7] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [48, 38863.2] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [47, 41971.9] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [42, 43036.4] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [37, 42925.6] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [51, 25010.2] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [49, 30886.0] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [48, 36642.7] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [49, 40414.3] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [45, 42176.5] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [42, 43321.3] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [44, 41457.0] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [4, 2489.94] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [32, 4083.03] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [16, 6885.06] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [26, 10896.0] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [16, 14619.1] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [6, 18105.7] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [20, 19414.2] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [10, 5634.67] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [9, 10540.1] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [17, 17084.7] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [7, 24790.8] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [17, 31513.9] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [15, 37249.0] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 38655.1] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [48, 10338.8] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [40, 16927.4] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [60, 24593.2] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [58, 31291.6] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [60, 37382.6] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [40, 39381.7] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [34, 41215.3] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [54, 14514.7] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [61, 22022.8] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [40, 30156.2] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [42, 35698.5] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [60, 38757.6] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [40, 41716.1] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [46, 42775.1] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [61, 19273.1] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [65, 27091.4] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [61, 33915.8] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [62, 37419.7] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [54, 41089.5] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [60, 43202.6] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [58, 42751.4] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [61, 23254.2] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [56, 30652.9] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [62, 35241.6] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [58, 39681.5] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [58, 42476.9] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [42, 42935.5] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [61, 42847.9] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [58, 23153.0] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [60, 30122.8] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [91, 35267.4] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [47, 40597.4] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [47, 41985.1] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [40, 43431.8] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [99, 40503.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_I8II_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_I8II_BH.yaml new file mode 100644 index 00000000000..ab0bbd8a9ca --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_I8II_BH.yaml @@ -0,0 +1,28113 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 24832 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8448 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8448 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 24832 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [1, 36.4646] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [18, 61.6157] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [16, 106.357] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [57, 165.953] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [56, 234.804] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [57, 302.751] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [69, 342.009] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [27, 66.9845] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [16, 122.971] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [50, 212.456] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [10, 333.729] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [56, 475.114] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [56, 603.736] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [56, 678.073] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [30, 152.987] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [9, 279.807] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [56, 462.133] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [56, 729.254] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [57, 1005.11] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [69, 1263.73] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [87, 1426.73] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [86, 327.937] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [56, 598.93] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [69, 1017.66] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [56, 1569.43] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [95, 2130.17] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [69, 2585.39] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [56, 2898.37] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [69, 678.363] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [79, 1226.4] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [15, 2020.87] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [79, 3073.6] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [79, 4307.93] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [56, 5174.17] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [69, 5820.88] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [95, 1267.74] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [86, 2280.75] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [79, 3761.3] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [95, 5872.85] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [69, 8268.72] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [62, 10099.5] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [69, 11490.8] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [32, 2119.67] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [2, 3783.77] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [33, 6294.81] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [47, 9922.23] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [21, 13753.9] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [50, 17211.9] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [21, 18627.1] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [4, 64.0156] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [27, 122.943] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [27, 212.37] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [79, 326.837] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [57, 482.27] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [63, 600.667] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [96, 681.212] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [25, 182.742] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [31, 334.154] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [96, 534.715] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [87, 821.929] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [63, 1099.14] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [87, 1311.03] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [63, 1468.5] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [31, 395.316] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [29, 697.076] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [87, 1106.39] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [56, 1663.09] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [96, 2274.88] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [87, 2670.79] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [96, 2961.36] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [57, 807.529] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [87, 1425.43] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [10, 2310.91] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [69, 3390.72] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [87, 4593.04] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [57, 5393.53] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [69, 5953.86] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [86, 1467.31] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [80, 2616.13] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [57, 4378.2] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [70, 6425.61] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [70, 8702.45] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [87, 10634.9] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [57, 11760.6] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [56, 2577.15] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [73, 4849.64] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [69, 7956.94] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [70, 12220.5] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [96, 16674.0] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [96, 20479.6] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [95, 23351.4] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [56, 4976.19] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [54, 9346.64] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [19, 14421.2] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [97, 22911.9] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [52, 30915.1] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [2, 36468.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [2, 38064.5] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [2, 162.823] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [15, 268.419] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [56, 460.356] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [56, 746.185] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [56, 1014.71] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [96, 1275.69] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [69, 1422.28] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [43, 391.698] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [31, 694.306] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [56, 1155.94] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [56, 1719.68] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [69, 2227.02] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [79, 2702.74] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [95, 2980.24] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [96, 754.782] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [69, 1347.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [69, 2257.73] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [86, 3482.93] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [69, 4517.6] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [96, 5444.5] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [86, 6009.98] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [28, 1443.08] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [95, 2505.93] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [87, 4221.21] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [87, 6617.58] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [56, 8807.54] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [96, 10603.8] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [80, 11754.9] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [11, 2571.61] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [12, 4605.96] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [69, 7776.23] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [69, 12336.2] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [70, 16512.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [86, 20510.8] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [95, 23165.5] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [95, 4740.67] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [35, 8823.15] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [24, 14671.9] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [38, 22021.0] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [88, 29947.7] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [44, 36415.3] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [2, 38295.3] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [86, 9478.65] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [89, 15957.4] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [19, 22608.9] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [76, 29594.4] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [61, 35018.7] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [91, 37462.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [99, 40120.4] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [14, 251.017] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [26, 431.394] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [98, 720.836] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [56, 1145.88] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [56, 1550.19] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [56, 1936.95] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [56, 2150.38] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [87, 596.689] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [87, 1058.64] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [13, 1729.85] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [69, 2614.63] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [87, 3348.52] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [62, 4071.32] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [69, 4475.92] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [57, 1101.83] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [70, 2065.83] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [56, 3419.27] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [70, 5109.31] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [86, 6658.08] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [87, 8087.04] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [87, 8924.45] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [70, 1954.78] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [63, 3559.52] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [63, 6024.14] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [57, 9550.6] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [70, 12682.8] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [80, 15623.1] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [79, 17438.4] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [96, 4041.42] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [42, 7122.09] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [53, 11299.1] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [23, 17297.6] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [5, 22698.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [53, 27014.7] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [51, 28693.8] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [87, 7737.38] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [89, 13477.5] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [52, 20140.8] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [76, 27226.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [75, 34287.3] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [75, 36698.3] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [66, 39379.8] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [94, 12294.0] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [92, 19603.4] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [67, 26502.6] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [3, 32352.0] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [24, 35150.1] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [5, 38319.7] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [0, 39902.5] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [3, 521.336] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [69, 911.937] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [95, 1548.48] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [69, 2366.99] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [95, 3198.91] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [86, 3854.62] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [95, 4325.24] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [56, 1136.87] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [70, 2036.41] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [17, 3333.67] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [56, 4946.6] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [87, 6696.17] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [87, 7940.01] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [70, 8809.26] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [57, 1955.69] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [56, 3699.22] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [40, 6082.41] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [62, 9286.28] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [87, 12858.6] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [62, 15355.6] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [62, 17321.8] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [69, 3629.35] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [41, 7139.24] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [8, 10793.8] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [71, 16843.2] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [46, 22742.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [2, 26940.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [24, 28935.1] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [86, 8121.94] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [89, 13502.8] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [91, 20195.3] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [91, 27929.1] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [91, 33825.1] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [91, 36410.4] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [91, 39323.6] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [84, 12286.5] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [35, 19217.9] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [50, 25969.3] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [75, 33185.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [75, 35955.2] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [75, 39112.5] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [75, 41087.1] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [90, 17225.1] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [93, 24667.9] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [93, 31551.0] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [91, 35398.1] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [91, 38977.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [75, 41188.8] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [75, 41797.4] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [3, 962.585] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [95, 1767.76] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [79, 3030.94] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [56, 4549.96] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [79, 6253.55] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [86, 7565.26] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [80, 8153.03] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [30, 1928.43] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [62, 3691.63] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [62, 6226.1] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [86, 9224.16] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [69, 12645.3] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [96, 15299.0] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [70, 17380.5] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [48, 3743.81] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [34, 6733.33] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [71, 11227.2] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [44, 16777.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [88, 22415.7] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [3, 27039.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 28858.4] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [95, 8089.34] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [72, 13493.8] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [82, 19548.2] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [91, 28024.4] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 33843.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [91, 36464.0] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [60, 39425.1] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [78, 12372.6] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [36, 19315.7] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [36, 26011.2] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [75, 33351.6] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 36068.0] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [75, 39152.5] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [75, 41098.1] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [78, 16872.9] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [78, 24683.0] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [93, 31600.6] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [75, 35412.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 39036.7] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [75, 41217.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [75, 41741.7] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [98, 21902.4] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [36, 29299.9] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [36, 33728.7] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [75, 38410.1] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 40913.8] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [75, 41475.7] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [61, 41726.0] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [48, 1550.39] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [33, 3030.93] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [26, 5128.05] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [21, 7643.38] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [50, 10644.3] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [9, 12953.7] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [21, 14306.7] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [55, 3807.81] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [9, 6232.25] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [52, 10402.4] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [37, 17426.3] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [6, 22940.6] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [39, 27226.6] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [45, 28950.2] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [86, 7738.57] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [77, 13586.6] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [35, 20181.1] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [100, 27539.4] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [76, 34329.7] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 36726.7] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [82, 39650.8] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [77, 11863.7] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [77, 18996.7] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [50, 26614.7] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [76, 32837.5] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [75, 35822.5] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 39309.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [91, 40947.9] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [74, 17251.6] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [93, 25246.3] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [20, 31507.8] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [75, 35200.1] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [75, 39209.7] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [75, 41255.5] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [60, 41762.8] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [81, 21704.1] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [85, 29683.3] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [82, 33793.9] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [99, 38315.6] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [75, 40925.8] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 41684.5] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [75, 41418.4] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [64, 6641.44] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [65, 12983.6] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [65, 25239.4] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [63, 38579.9] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [67, 40575.1] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [75, 41624.3] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [76, 41686.9] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [9, 2286.04] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [7, 4202.71] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [33, 7061.14] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [47, 11003.3] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [14, 14683.8] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [3, 17541.4] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [2, 18721.0] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [15, 4943.93] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [49, 9607.64] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [35, 14687.9] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [97, 22601.3] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [5, 30625.9] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [3, 36577.9] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [2, 38162.6] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [69, 9396.37] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [72, 16176.7] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [35, 22869.0] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [66, 30585.8] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [91, 35786.7] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [75, 37887.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [99, 40129.4] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [52, 14114.8] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [59, 21877.4] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [83, 28425.4] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [60, 35024.4] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [91, 37284.7] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [75, 40275.0] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [60, 41785.8] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [35, 18693.3] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [85, 26867.8] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [35, 33306.7] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [60, 36590.3] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [60, 39935.4] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [75, 41775.6] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [35, 41003.6] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [90, 22677.7] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [68, 30743.4] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [50, 34783.9] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [52, 38051.7] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [56, 40550.6] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [61, 41121.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 42286.6] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [58, 6883.54] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [58, 13780.3] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [65, 26062.2] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [63, 39162.5] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [3, 40101.9] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [76, 41557.7] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 41818.3] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_I8II_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_I8II_BH_GB.yaml new file mode 100644 index 00000000000..66968af19af --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_I8II_BH_GB.yaml @@ -0,0 +1,28113 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 2 + LSPB: 16 + LVCA: 16 + LVCB: 2 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12416 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2304 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 16 + LSPA: 4 + LSPB: 32 + LVCA: 8 + LVCB: 1 + LVPA: 1 + LVPB: 2 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2560 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 24832 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8448 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 8448 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 24832 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 4352 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 2 + LSPB: 8 + LVCA: 16 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 95 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 96 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 97 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPB32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 98 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 99 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 128 + LSCB: 32 + LSPA: 4 + LSPB: 16 + LVCA: 8 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4096 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4096 + LdsOffsetB_Blk: 20480 + LdsPadA: 0 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 100 + SolutionNameMin: Cijk_Ailk_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [1, 36.4646] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [18, 61.6157] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [16, 106.357] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [57, 165.953] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [56, 234.804] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [57, 302.751] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [69, 342.009] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [27, 66.9845] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [16, 122.971] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [50, 212.456] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [10, 333.729] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [56, 475.114] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [56, 603.736] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [56, 678.073] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [30, 152.987] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [9, 279.807] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [56, 462.133] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [56, 729.254] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [57, 1005.11] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [69, 1263.73] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [87, 1426.73] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [86, 327.937] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [56, 598.93] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [69, 1017.66] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [56, 1569.43] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [95, 2130.17] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [69, 2585.39] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [56, 2898.37] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [69, 678.363] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [79, 1226.4] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [15, 2020.87] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [79, 3073.6] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [79, 4307.93] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [56, 5174.17] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [69, 5820.88] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [95, 1267.74] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [86, 2280.75] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [79, 3761.3] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [95, 5872.85] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [69, 8268.72] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [62, 10099.5] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [69, 11490.8] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [32, 2119.67] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [2, 3783.77] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [33, 6294.81] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [47, 9922.23] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [21, 13753.9] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [50, 17211.9] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [21, 18627.1] + - - [128, 64, 1, 64, 160, 160, 128, 96] + - [4, 64.0156] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [27, 122.943] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [27, 212.37] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [79, 326.837] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [57, 482.27] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [63, 600.667] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [96, 681.212] + - - [128, 128, 1, 64, 160, 160, 128, 96] + - [25, 182.742] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [31, 334.154] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [96, 534.715] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [87, 821.929] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [63, 1099.14] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [87, 1311.03] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [63, 1468.5] + - - [128, 256, 1, 64, 160, 160, 128, 96] + - [31, 395.316] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [29, 697.076] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [87, 1106.39] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [56, 1663.09] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [96, 2274.88] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [87, 2670.79] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [96, 2961.36] + - - [128, 512, 1, 64, 160, 160, 128, 96] + - [57, 807.529] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [87, 1425.43] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [10, 2310.91] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [69, 3390.72] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [87, 4593.04] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [57, 5393.53] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [69, 5953.86] + - - [128, 1024, 1, 64, 160, 160, 128, 96] + - [86, 1467.31] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [80, 2616.13] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [57, 4378.2] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [70, 6425.61] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [70, 8702.45] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [87, 10634.9] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [57, 11760.6] + - - [128, 2048, 1, 64, 160, 160, 128, 96] + - [56, 2577.15] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [73, 4849.64] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [69, 7956.94] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [70, 12220.5] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [96, 16674.0] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [96, 20479.6] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [95, 23351.4] + - - [128, 4096, 1, 64, 160, 160, 128, 96] + - [56, 4976.19] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [54, 9346.64] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [19, 14421.2] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [97, 22911.9] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [52, 30915.1] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [2, 36468.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [2, 38064.5] + - - [256, 64, 1, 64, 288, 288, 256, 96] + - [2, 162.823] + - - [256, 64, 1, 128, 288, 288, 256, 160] + - [15, 268.419] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [56, 460.356] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [56, 746.185] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [56, 1014.71] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [96, 1275.69] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [69, 1422.28] + - - [256, 128, 1, 64, 288, 288, 256, 96] + - [43, 391.698] + - - [256, 128, 1, 128, 288, 288, 256, 160] + - [31, 694.306] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [56, 1155.94] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [56, 1719.68] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [69, 2227.02] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [79, 2702.74] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [95, 2980.24] + - - [256, 256, 1, 64, 288, 288, 256, 96] + - [96, 754.782] + - - [256, 256, 1, 128, 288, 288, 256, 160] + - [69, 1347.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [69, 2257.73] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [86, 3482.93] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [69, 4517.6] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [96, 5444.5] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [86, 6009.98] + - - [256, 512, 1, 64, 288, 288, 256, 96] + - [28, 1443.08] + - - [256, 512, 1, 128, 288, 288, 256, 160] + - [95, 2505.93] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [87, 4221.21] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [87, 6617.58] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [56, 8807.54] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [96, 10603.8] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [80, 11754.9] + - - [256, 1024, 1, 64, 288, 288, 256, 96] + - [11, 2571.61] + - - [256, 1024, 1, 128, 288, 288, 256, 160] + - [12, 4605.96] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [69, 7776.23] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [69, 12336.2] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [70, 16512.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [86, 20510.8] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [95, 23165.5] + - - [256, 2048, 1, 64, 288, 288, 256, 96] + - [95, 4740.67] + - - [256, 2048, 1, 128, 288, 288, 256, 160] + - [35, 8823.15] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [24, 14671.9] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [38, 22021.0] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [88, 29947.7] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [44, 36415.3] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [2, 38295.3] + - - [256, 4096, 1, 64, 288, 288, 256, 96] + - [86, 9478.65] + - - [256, 4096, 1, 128, 288, 288, 256, 160] + - [89, 15957.4] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [19, 22608.9] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [76, 29594.4] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [61, 35018.7] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [91, 37462.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [99, 40120.4] + - - [384, 64, 1, 64, 416, 416, 384, 96] + - [14, 251.017] + - - [384, 64, 1, 128, 416, 416, 384, 160] + - [26, 431.394] + - - [384, 64, 1, 256, 416, 416, 384, 288] + - [98, 720.836] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [56, 1145.88] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [56, 1550.19] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [56, 1936.95] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [56, 2150.38] + - - [384, 128, 1, 64, 416, 416, 384, 96] + - [87, 596.689] + - - [384, 128, 1, 128, 416, 416, 384, 160] + - [87, 1058.64] + - - [384, 128, 1, 256, 416, 416, 384, 288] + - [13, 1729.85] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [69, 2614.63] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [87, 3348.52] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [62, 4071.32] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [69, 4475.92] + - - [384, 256, 1, 64, 416, 416, 384, 96] + - [57, 1101.83] + - - [384, 256, 1, 128, 416, 416, 384, 160] + - [70, 2065.83] + - - [384, 256, 1, 256, 416, 416, 384, 288] + - [56, 3419.27] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [70, 5109.31] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [86, 6658.08] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [87, 8087.04] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [87, 8924.45] + - - [384, 512, 1, 64, 416, 416, 384, 96] + - [70, 1954.78] + - - [384, 512, 1, 128, 416, 416, 384, 160] + - [63, 3559.52] + - - [384, 512, 1, 256, 416, 416, 384, 288] + - [63, 6024.14] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [57, 9550.6] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [70, 12682.8] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [80, 15623.1] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [79, 17438.4] + - - [384, 1024, 1, 64, 416, 416, 384, 96] + - [96, 4041.42] + - - [384, 1024, 1, 128, 416, 416, 384, 160] + - [42, 7122.09] + - - [384, 1024, 1, 256, 416, 416, 384, 288] + - [53, 11299.1] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [23, 17297.6] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [5, 22698.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [53, 27014.7] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [51, 28693.8] + - - [384, 2048, 1, 64, 416, 416, 384, 96] + - [87, 7737.38] + - - [384, 2048, 1, 128, 416, 416, 384, 160] + - [89, 13477.5] + - - [384, 2048, 1, 256, 416, 416, 384, 288] + - [52, 20140.8] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [76, 27226.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [75, 34287.3] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [75, 36698.3] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [66, 39379.8] + - - [384, 4096, 1, 64, 416, 416, 384, 96] + - [94, 12294.0] + - - [384, 4096, 1, 128, 416, 416, 384, 160] + - [92, 19603.4] + - - [384, 4096, 1, 256, 416, 416, 384, 288] + - [67, 26502.6] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [3, 32352.0] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [24, 35150.1] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [5, 38319.7] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [0, 39902.5] + - - [768, 64, 1, 64, 800, 800, 768, 96] + - [3, 521.336] + - - [768, 64, 1, 128, 800, 800, 768, 160] + - [69, 911.937] + - - [768, 64, 1, 256, 800, 800, 768, 288] + - [95, 1548.48] + - - [768, 64, 1, 512, 800, 800, 768, 544] + - [69, 2366.99] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [95, 3198.91] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [86, 3854.62] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [95, 4325.24] + - - [768, 128, 1, 64, 800, 800, 768, 96] + - [56, 1136.87] + - - [768, 128, 1, 128, 800, 800, 768, 160] + - [70, 2036.41] + - - [768, 128, 1, 256, 800, 800, 768, 288] + - [17, 3333.67] + - - [768, 128, 1, 512, 800, 800, 768, 544] + - [56, 4946.6] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [87, 6696.17] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [87, 7940.01] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [70, 8809.26] + - - [768, 256, 1, 64, 800, 800, 768, 96] + - [57, 1955.69] + - - [768, 256, 1, 128, 800, 800, 768, 160] + - [56, 3699.22] + - - [768, 256, 1, 256, 800, 800, 768, 288] + - [40, 6082.41] + - - [768, 256, 1, 512, 800, 800, 768, 544] + - [62, 9286.28] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [87, 12858.6] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [62, 15355.6] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [62, 17321.8] + - - [768, 512, 1, 64, 800, 800, 768, 96] + - [69, 3629.35] + - - [768, 512, 1, 128, 800, 800, 768, 160] + - [41, 7139.24] + - - [768, 512, 1, 256, 800, 800, 768, 288] + - [8, 10793.8] + - - [768, 512, 1, 512, 800, 800, 768, 544] + - [71, 16843.2] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [46, 22742.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [2, 26940.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [24, 28935.1] + - - [768, 1024, 1, 64, 800, 800, 768, 96] + - [86, 8121.94] + - - [768, 1024, 1, 128, 800, 800, 768, 160] + - [89, 13502.8] + - - [768, 1024, 1, 256, 800, 800, 768, 288] + - [91, 20195.3] + - - [768, 1024, 1, 512, 800, 800, 768, 544] + - [91, 27929.1] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [91, 33825.1] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [91, 36410.4] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [91, 39323.6] + - - [768, 2048, 1, 64, 800, 800, 768, 96] + - [84, 12286.5] + - - [768, 2048, 1, 128, 800, 800, 768, 160] + - [35, 19217.9] + - - [768, 2048, 1, 256, 800, 800, 768, 288] + - [50, 25969.3] + - - [768, 2048, 1, 512, 800, 800, 768, 544] + - [75, 33185.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [75, 35955.2] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [75, 39112.5] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [75, 41087.1] + - - [768, 4096, 1, 64, 800, 800, 768, 96] + - [90, 17225.1] + - - [768, 4096, 1, 128, 800, 800, 768, 160] + - [93, 24667.9] + - - [768, 4096, 1, 256, 800, 800, 768, 288] + - [93, 31551.0] + - - [768, 4096, 1, 512, 800, 800, 768, 544] + - [91, 35398.1] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [91, 38977.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [75, 41188.8] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [75, 41797.4] + - - [1536, 64, 1, 64, 1568, 1568, 1536, 96] + - [3, 962.585] + - - [1536, 64, 1, 128, 1568, 1568, 1536, 160] + - [95, 1767.76] + - - [1536, 64, 1, 256, 1568, 1568, 1536, 288] + - [79, 3030.94] + - - [1536, 64, 1, 512, 1568, 1568, 1536, 544] + - [56, 4549.96] + - - [1536, 64, 1, 1024, 1568, 1568, 1536, 1056] + - [79, 6253.55] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [86, 7565.26] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [80, 8153.03] + - - [1536, 128, 1, 64, 1568, 1568, 1536, 96] + - [30, 1928.43] + - - [1536, 128, 1, 128, 1568, 1568, 1536, 160] + - [62, 3691.63] + - - [1536, 128, 1, 256, 1568, 1568, 1536, 288] + - [62, 6226.1] + - - [1536, 128, 1, 512, 1568, 1568, 1536, 544] + - [86, 9224.16] + - - [1536, 128, 1, 1024, 1568, 1568, 1536, 1056] + - [69, 12645.3] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [96, 15299.0] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [70, 17380.5] + - - [1536, 256, 1, 64, 1568, 1568, 1536, 96] + - [48, 3743.81] + - - [1536, 256, 1, 128, 1568, 1568, 1536, 160] + - [34, 6733.33] + - - [1536, 256, 1, 256, 1568, 1568, 1536, 288] + - [71, 11227.2] + - - [1536, 256, 1, 512, 1568, 1568, 1536, 544] + - [44, 16777.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1536, 1056] + - [88, 22415.7] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [3, 27039.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 28858.4] + - - [1536, 512, 1, 64, 1568, 1568, 1536, 96] + - [95, 8089.34] + - - [1536, 512, 1, 128, 1568, 1568, 1536, 160] + - [72, 13493.8] + - - [1536, 512, 1, 256, 1568, 1568, 1536, 288] + - [82, 19548.2] + - - [1536, 512, 1, 512, 1568, 1568, 1536, 544] + - [91, 28024.4] + - - [1536, 512, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 33843.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [91, 36464.0] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [60, 39425.1] + - - [1536, 1024, 1, 64, 1568, 1568, 1536, 96] + - [78, 12372.6] + - - [1536, 1024, 1, 128, 1568, 1568, 1536, 160] + - [36, 19315.7] + - - [1536, 1024, 1, 256, 1568, 1568, 1536, 288] + - [36, 26011.2] + - - [1536, 1024, 1, 512, 1568, 1568, 1536, 544] + - [75, 33351.6] + - - [1536, 1024, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 36068.0] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [75, 39152.5] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [75, 41098.1] + - - [1536, 2048, 1, 64, 1568, 1568, 1536, 96] + - [78, 16872.9] + - - [1536, 2048, 1, 128, 1568, 1568, 1536, 160] + - [78, 24683.0] + - - [1536, 2048, 1, 256, 1568, 1568, 1536, 288] + - [93, 31600.6] + - - [1536, 2048, 1, 512, 1568, 1568, 1536, 544] + - [75, 35412.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 39036.7] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [75, 41217.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [75, 41741.7] + - - [1536, 4096, 1, 64, 1568, 1568, 1536, 96] + - [98, 21902.4] + - - [1536, 4096, 1, 128, 1568, 1568, 1536, 160] + - [36, 29299.9] + - - [1536, 4096, 1, 256, 1568, 1568, 1536, 288] + - [36, 33728.7] + - - [1536, 4096, 1, 512, 1568, 1568, 1536, 544] + - [75, 38410.1] + - - [1536, 4096, 1, 1024, 1568, 1568, 1536, 1056] + - [75, 40913.8] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [75, 41475.7] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [61, 41726.0] + - - [3072, 64, 1, 64, 3104, 3104, 3072, 96] + - [48, 1550.39] + - - [3072, 64, 1, 128, 3104, 3104, 3072, 160] + - [33, 3030.93] + - - [3072, 64, 1, 256, 3104, 3104, 3072, 288] + - [26, 5128.05] + - - [3072, 64, 1, 512, 3104, 3104, 3072, 544] + - [21, 7643.38] + - - [3072, 64, 1, 1024, 3104, 3104, 3072, 1056] + - [50, 10644.3] + - - [3072, 64, 1, 2048, 3104, 3104, 3072, 2080] + - [9, 12953.7] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [21, 14306.7] + - - [3072, 128, 1, 64, 3104, 3104, 3072, 96] + - [55, 3807.81] + - - [3072, 128, 1, 128, 3104, 3104, 3072, 160] + - [9, 6232.25] + - - [3072, 128, 1, 256, 3104, 3104, 3072, 288] + - [52, 10402.4] + - - [3072, 128, 1, 512, 3104, 3104, 3072, 544] + - [37, 17426.3] + - - [3072, 128, 1, 1024, 3104, 3104, 3072, 1056] + - [6, 22940.6] + - - [3072, 128, 1, 2048, 3104, 3104, 3072, 2080] + - [39, 27226.6] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [45, 28950.2] + - - [3072, 256, 1, 64, 3104, 3104, 3072, 96] + - [86, 7738.57] + - - [3072, 256, 1, 128, 3104, 3104, 3072, 160] + - [77, 13586.6] + - - [3072, 256, 1, 256, 3104, 3104, 3072, 288] + - [35, 20181.1] + - - [3072, 256, 1, 512, 3104, 3104, 3072, 544] + - [100, 27539.4] + - - [3072, 256, 1, 1024, 3104, 3104, 3072, 1056] + - [76, 34329.7] + - - [3072, 256, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 36726.7] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [82, 39650.8] + - - [3072, 512, 1, 64, 3104, 3104, 3072, 96] + - [77, 11863.7] + - - [3072, 512, 1, 128, 3104, 3104, 3072, 160] + - [77, 18996.7] + - - [3072, 512, 1, 256, 3104, 3104, 3072, 288] + - [50, 26614.7] + - - [3072, 512, 1, 512, 3104, 3104, 3072, 544] + - [76, 32837.5] + - - [3072, 512, 1, 1024, 3104, 3104, 3072, 1056] + - [75, 35822.5] + - - [3072, 512, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 39309.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [91, 40947.9] + - - [3072, 1024, 1, 64, 3104, 3104, 3072, 96] + - [74, 17251.6] + - - [3072, 1024, 1, 128, 3104, 3104, 3072, 160] + - [93, 25246.3] + - - [3072, 1024, 1, 256, 3104, 3104, 3072, 288] + - [20, 31507.8] + - - [3072, 1024, 1, 512, 3104, 3104, 3072, 544] + - [75, 35200.1] + - - [3072, 1024, 1, 1024, 3104, 3104, 3072, 1056] + - [75, 39209.7] + - - [3072, 1024, 1, 2048, 3104, 3104, 3072, 2080] + - [75, 41255.5] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [60, 41762.8] + - - [3072, 2048, 1, 64, 3104, 3104, 3072, 96] + - [81, 21704.1] + - - [3072, 2048, 1, 128, 3104, 3104, 3072, 160] + - [85, 29683.3] + - - [3072, 2048, 1, 256, 3104, 3104, 3072, 288] + - [82, 33793.9] + - - [3072, 2048, 1, 512, 3104, 3104, 3072, 544] + - [99, 38315.6] + - - [3072, 2048, 1, 1024, 3104, 3104, 3072, 1056] + - [75, 40925.8] + - - [3072, 2048, 1, 2048, 3104, 3104, 3072, 2080] + - [60, 41684.5] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [75, 41418.4] + - - [3072, 4096, 1, 64, 3104, 3104, 3072, 96] + - [64, 6641.44] + - - [3072, 4096, 1, 128, 3104, 3104, 3072, 160] + - [65, 12983.6] + - - [3072, 4096, 1, 256, 3104, 3104, 3072, 288] + - [65, 25239.4] + - - [3072, 4096, 1, 512, 3104, 3104, 3072, 544] + - [63, 38579.9] + - - [3072, 4096, 1, 1024, 3104, 3104, 3072, 1056] + - [67, 40575.1] + - - [3072, 4096, 1, 2048, 3104, 3104, 3072, 2080] + - [75, 41624.3] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [76, 41686.9] + - - [4096, 64, 1, 64, 4128, 4128, 4096, 96] + - [9, 2286.04] + - - [4096, 64, 1, 128, 4128, 4128, 4096, 160] + - [7, 4202.71] + - - [4096, 64, 1, 256, 4128, 4128, 4096, 288] + - [33, 7061.14] + - - [4096, 64, 1, 512, 4128, 4128, 4096, 544] + - [47, 11003.3] + - - [4096, 64, 1, 1024, 4128, 4128, 4096, 1056] + - [14, 14683.8] + - - [4096, 64, 1, 2048, 4128, 4128, 4096, 2080] + - [3, 17541.4] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [2, 18721.0] + - - [4096, 128, 1, 64, 4128, 4128, 4096, 96] + - [15, 4943.93] + - - [4096, 128, 1, 128, 4128, 4128, 4096, 160] + - [49, 9607.64] + - - [4096, 128, 1, 256, 4128, 4128, 4096, 288] + - [35, 14687.9] + - - [4096, 128, 1, 512, 4128, 4128, 4096, 544] + - [97, 22601.3] + - - [4096, 128, 1, 1024, 4128, 4128, 4096, 1056] + - [5, 30625.9] + - - [4096, 128, 1, 2048, 4128, 4128, 4096, 2080] + - [3, 36577.9] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [2, 38162.6] + - - [4096, 256, 1, 64, 4128, 4128, 4096, 96] + - [69, 9396.37] + - - [4096, 256, 1, 128, 4128, 4128, 4096, 160] + - [72, 16176.7] + - - [4096, 256, 1, 256, 4128, 4128, 4096, 288] + - [35, 22869.0] + - - [4096, 256, 1, 512, 4128, 4128, 4096, 544] + - [66, 30585.8] + - - [4096, 256, 1, 1024, 4128, 4128, 4096, 1056] + - [91, 35786.7] + - - [4096, 256, 1, 2048, 4128, 4128, 4096, 2080] + - [75, 37887.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [99, 40129.4] + - - [4096, 512, 1, 64, 4128, 4128, 4096, 96] + - [52, 14114.8] + - - [4096, 512, 1, 128, 4128, 4128, 4096, 160] + - [59, 21877.4] + - - [4096, 512, 1, 256, 4128, 4128, 4096, 288] + - [83, 28425.4] + - - [4096, 512, 1, 512, 4128, 4128, 4096, 544] + - [60, 35024.4] + - - [4096, 512, 1, 1024, 4128, 4128, 4096, 1056] + - [91, 37284.7] + - - [4096, 512, 1, 2048, 4128, 4128, 4096, 2080] + - [75, 40275.0] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [60, 41785.8] + - - [4096, 1024, 1, 64, 4128, 4128, 4096, 96] + - [35, 18693.3] + - - [4096, 1024, 1, 128, 4128, 4128, 4096, 160] + - [85, 26867.8] + - - [4096, 1024, 1, 256, 4128, 4128, 4096, 288] + - [35, 33306.7] + - - [4096, 1024, 1, 512, 4128, 4128, 4096, 544] + - [60, 36590.3] + - - [4096, 1024, 1, 1024, 4128, 4128, 4096, 1056] + - [60, 39935.4] + - - [4096, 1024, 1, 2048, 4128, 4128, 4096, 2080] + - [75, 41775.6] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [35, 41003.6] + - - [4096, 2048, 1, 64, 4128, 4128, 4096, 96] + - [90, 22677.7] + - - [4096, 2048, 1, 128, 4128, 4128, 4096, 160] + - [68, 30743.4] + - - [4096, 2048, 1, 256, 4128, 4128, 4096, 288] + - [50, 34783.9] + - - [4096, 2048, 1, 512, 4128, 4128, 4096, 544] + - [52, 38051.7] + - - [4096, 2048, 1, 1024, 4128, 4128, 4096, 1056] + - [56, 40550.6] + - - [4096, 2048, 1, 2048, 4128, 4128, 4096, 2080] + - [61, 41121.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 42286.6] + - - [4096, 4096, 1, 64, 4128, 4128, 4096, 96] + - [58, 6883.54] + - - [4096, 4096, 1, 128, 4128, 4128, 4096, 160] + - [58, 13780.3] + - - [4096, 4096, 1, 256, 4128, 4128, 4096, 288] + - [65, 26062.2] + - - [4096, 4096, 1, 512, 4128, 4128, 4096, 544] + - [63, 39162.5] + - - [4096, 4096, 1, 1024, 4128, 4128, 4096, 1056] + - [3, 40101.9] + - - [4096, 4096, 1, 2048, 4128, 4128, 4096, 2080] + - [76, 41557.7] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 41818.3] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_SB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_SB.yaml new file mode 100644 index 00000000000..43dcfd0ee64 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Ailk_Bljk_SB.yaml @@ -0,0 +1,310 @@ +- {MinimumRequiredVersion: 4.33.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 8 + LSPA: 8 + LSPB: 32 + LVCA: 32 + LVCB: 8 + LVPA: 8 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: false + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_SB_MT32x32x8_SN_ + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWforTLUandMI: false +- [2, 3, 0, 1] +- - - [126, 126, 2, 66, 126, 126, 126, 66] + - [0, 0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_BBS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_BBS_BH.yaml new file mode 100644 index 00000000000..5fac07b633e --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_BBS_BH.yaml @@ -0,0 +1,21903 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [6, 31.3419] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [13, 53.7456] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [7, 93.4143] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [69, 146.041] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [64, 210.643] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [69, 274.093] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [69, 305.224] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [2, 56.1757] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [13, 103.758] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [13, 181.713] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [69, 299.529] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [64, 427.011] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [69, 542.794] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [77, 612.274] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [49, 134.519] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [6, 246.173] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [69, 422.09] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [69, 666.45] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [77, 932.586] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [73, 1148.38] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [64, 1273.73] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [46, 278.58] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [6, 510.504] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [20, 870.007] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [77, 1369.79] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [65, 1930.19] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [73, 2450.93] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [71, 2638.29] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [22, 579.724] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [77, 1053.98] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [77, 1859.59] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [69, 2902.88] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [69, 4025.49] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [69, 4916.23] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [76, 5422.94] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [46, 1229.65] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [21, 2227.16] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [20, 3710.96] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [68, 5620.51] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [64, 7861.41] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [66, 9902.09] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [73, 9588.87] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [58, 2126.93] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [21, 3867.96] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [46, 6570.3] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [5, 10059.1] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [56, 14007.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [21, 17248.3] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [77, 14709.2] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [10, 66.475] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [13, 116.405] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [6, 186.812] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [6, 292.246] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [64, 418.656] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [66, 539.218] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [66, 607.452] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [3, 139.847] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [46, 265.867] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [46, 450.662] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [7, 688.155] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [69, 958.698] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [69, 1167.31] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [66, 1261.47] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [8, 325.746] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [46, 588.426] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [13, 1019.4] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [46, 1515.01] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [13, 1995.98] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [64, 2391.45] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [74, 2590.03] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [22, 728.81] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [22, 1304.4] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [22, 2133.42] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [74, 3046.25] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [74, 4068.44] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [71, 4851.89] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [76, 5304.0] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [22, 1411.51] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [13, 2529.74] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [22, 4143.03] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [77, 6194.28] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [77, 8205.52] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [71, 9977.9] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [77, 10612.6] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [13, 2663.9] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [58, 4954.15] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [13, 8181.04] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [69, 12073.2] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [70, 16273.8] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [66, 19675.0] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [63, 21060.4] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [58, 4735.98] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [6, 8661.44] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [55, 14341.1] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [6, 21721.6] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [33, 28980.9] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [45, 35093.1] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [45, 33039.3] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [2, 142.141] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [21, 250.736] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [6, 408.125] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [66, 643.002] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [66, 917.088] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [66, 1139.83] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [64, 1242.11] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [46, 308.77] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [22, 587.191] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [46, 1020.52] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [22, 1517.48] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [46, 2000.74] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [74, 2370.92] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [66, 2572.11] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [58, 725.784] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [22, 1304.6] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [13, 2125.58] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [69, 3035.5] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [69, 4017.29] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [63, 4810.84] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [64, 5195.3] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [46, 1486.55] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [44, 2648.78] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [46, 4331.84] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [66, 6314.94] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [74, 8129.97] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [64, 9840.73] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [63, 10545.1] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [58, 2796.69] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [46, 4976.93] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [7, 8191.0] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [13, 12053.7] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [74, 16096.1] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [76, 19227.5] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [66, 20928.2] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [59, 4436.07] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [20, 8234.22] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [12, 14061.6] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [12, 21365.4] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [11, 27006.9] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [12, 34570.0] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [31, 37368.4] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [32, 7932.49] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [38, 14082.2] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [16, 21178.4] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [18, 27895.2] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [38, 33316.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [3, 35361.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [35, 36518.4] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [42, 228.084] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [6, 402.27] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [46, 650.82] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [69, 1008.89] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [22, 1379.02] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [69, 1707.55] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [66, 1885.58] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [34, 496.798] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [46, 947.227] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [7, 1500.65] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [69, 2297.2] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [66, 3037.33] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [22, 3580.67] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [63, 3860.61] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [22, 1047.01] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [22, 1884.52] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [34, 3214.44] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [46, 4711.82] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [63, 6166.97] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [66, 7336.17] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [66, 7785.41] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [46, 2042.35] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [7, 3652.51] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [22, 6018.37] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [69, 9206.45] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [64, 12175.1] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [74, 14334.9] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [66, 15369.6] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [45, 3663.68] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [22, 6545.93] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [57, 10981.1] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [20, 16102.3] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [22, 21679.5] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [57, 26211.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [22, 28380.4] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [49, 6213.02] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [39, 10702.0] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [39, 18051.4] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [49, 24073.5] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [62, 30606.0] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [26, 34036.7] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [48, 37099.4] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [51, 10422.8] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [42, 16622.1] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [42, 23483.8] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [51, 30116.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [2, 33413.8] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [2, 36691.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [16, 37313.6] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [39, 454.782] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [6, 810.128] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [46, 1319.24] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [66, 2038.54] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [71, 2794.96] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [77, 3451.87] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [66, 3760.23] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [22, 1034.44] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [46, 1958.73] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [58, 3195.66] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [46, 4692.49] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [77, 6136.14] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [63, 7270.21] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [64, 7702.75] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [56, 2101.71] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [46, 3774.12] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [13, 6218.43] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [74, 9186.28] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [69, 12130.3] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [71, 14302.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [77, 15585.0] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [38, 3626.2] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [31, 6595.68] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [19, 10682.7] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [20, 15968.2] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [21, 21041.7] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [58, 26153.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [33, 28365.9] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [51, 5971.27] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [39, 10776.5] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [54, 18203.2] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [60, 24603.0] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [62, 31483.1] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [59, 34001.4] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [59, 37627.6] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [15, 10474.8] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [30, 16803.8] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [27, 24098.0] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [39, 31332.4] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [18, 34262.5] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [3, 37580.7] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [18, 39058.2] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [47, 13920.1] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [16, 21061.5] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [51, 28053.6] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [40, 32705.5] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [16, 37264.6] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [18, 39923.5] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [41, 40574.3] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [46, 963.47] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [22, 1623.18] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [58, 2724.16] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [5, 4143.21] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [69, 5591.48] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [69, 6922.48] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [64, 7706.44] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [42, 1887.63] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [22, 3597.17] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [22, 5963.48] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [46, 8840.97] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [64, 11817.7] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [74, 14597.9] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [64, 15648.9] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [55, 3489.92] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [45, 6316.72] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [33, 10590.6] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [45, 16378.7] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [46, 21949.0] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [45, 26374.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [43, 28173.3] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [24, 6168.85] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [24, 10743.1] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [42, 17652.5] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [28, 24610.6] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [24, 31603.0] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 34763.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [47, 37426.5] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [38, 10887.3] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [39, 16999.6] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [41, 24244.5] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [39, 31010.3] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [39, 34354.6] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [18, 37523.8] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 39432.3] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [47, 13686.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [42, 20792.8] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [54, 28334.9] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [16, 32858.9] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [54, 37154.2] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [18, 39958.7] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 40498.2] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [54, 16087.5] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [39, 23586.3] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [39, 29628.1] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [39, 35542.2] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [54, 39207.7] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [16, 40647.0] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 41058.2] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [19, 1678.62] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [55, 2911.7] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [57, 5019.12] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [19, 7567.53] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [6, 10735.7] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [4, 12655.7] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [75, 10298.3] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [55, 3220.61] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [19, 6127.56] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [33, 10076.4] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [19, 14908.7] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [20, 21530.0] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 26064.2] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [71, 21717.2] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [24, 5793.24] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [60, 11558.5] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [49, 19328.6] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [26, 26360.3] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [47, 31997.2] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [23, 34932.9] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 36577.8] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [38, 9776.01] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [27, 17366.3] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [39, 24780.2] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [51, 31003.2] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [18, 34266.9] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 37733.5] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 38526.8] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [39, 13986.8] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [42, 21180.0] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [51, 28100.6] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [18, 33081.0] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [18, 37274.5] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 39924.7] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [0, 39642.0] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [39, 16096.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [39, 23578.0] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [16, 29621.6] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [42, 35377.9] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [18, 39180.5] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [18, 40769.0] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [52, 40033.8] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [39, 16953.1] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [51, 24535.6] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [54, 31692.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [39, 36868.0] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [51, 39602.3] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [53, 40759.8] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 38496.4] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [38, 1810.23] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [12, 3632.24] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [1, 5986.52] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [50, 9024.89] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [18, 12521.5] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 15810.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [67, 14132.5] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [39, 5069.41] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [61, 8922.89] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [30, 14676.7] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [16, 21728.7] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [39, 29012.2] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [2, 34178.2] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [72, 28705.1] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [36, 8789.66] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [50, 14625.4] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [37, 21662.0] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [29, 28523.6] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [2, 33504.2] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 35874.5] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [14, 36589.4] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [25, 11798.3] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [16, 18593.6] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [39, 25839.7] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [16, 32826.1] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [39, 35634.0] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 38853.7] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [61, 38321.1] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [42, 14780.9] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [16, 22119.9] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [39, 28047.5] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [42, 34050.3] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [16, 38228.2] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 40709.9] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [42, 39644.9] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [42, 16864.2] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [39, 23391.0] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [51, 30651.7] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [39, 36170.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [16, 39707.2] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 40750.7] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [38, 40255.2] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [46, 17374.8] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [42, 24761.4] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [39, 31607.4] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [39, 37076.6] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [51, 39402.8] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [18, 41023.1] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [59, 37774.0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_BBS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_BBS_BH_GB.yaml new file mode 100644 index 00000000000..584e89cf0c0 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_BBS_BH_GB.yaml @@ -0,0 +1,21903 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [6, 31.3419] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [13, 53.7456] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [7, 93.4143] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [69, 146.041] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [64, 210.643] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [69, 274.093] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [69, 305.224] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [2, 56.1757] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [13, 103.758] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [13, 181.713] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [69, 299.529] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [64, 427.011] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [69, 542.794] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [77, 612.274] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [49, 134.519] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [6, 246.173] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [69, 422.09] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [69, 666.45] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [77, 932.586] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [73, 1148.38] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [64, 1273.73] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [46, 278.58] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [6, 510.504] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [20, 870.007] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [77, 1369.79] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [65, 1930.19] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [73, 2450.93] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [71, 2638.29] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [22, 579.724] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [77, 1053.98] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [77, 1859.59] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [69, 2902.88] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [69, 4025.49] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [69, 4916.23] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [76, 5422.94] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [46, 1229.65] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [21, 2227.16] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [20, 3710.96] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [68, 5620.51] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [64, 7861.41] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [66, 9902.09] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [73, 9588.87] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [58, 2126.93] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [21, 3867.96] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [46, 6570.3] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [5, 10059.1] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [56, 14007.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [21, 17248.3] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [77, 14709.2] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [10, 66.475] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [13, 116.405] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [6, 186.812] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [6, 292.246] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [64, 418.656] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [66, 539.218] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [66, 607.452] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [3, 139.847] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [46, 265.867] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [46, 450.662] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [7, 688.155] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [69, 958.698] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [69, 1167.31] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [66, 1261.47] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [8, 325.746] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [46, 588.426] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [13, 1019.4] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [46, 1515.01] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [13, 1995.98] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [64, 2391.45] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [74, 2590.03] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [22, 728.81] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [22, 1304.4] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [22, 2133.42] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [74, 3046.25] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [74, 4068.44] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [71, 4851.89] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [76, 5304.0] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [22, 1411.51] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [13, 2529.74] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [22, 4143.03] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [77, 6194.28] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [77, 8205.52] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [71, 9977.9] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [77, 10612.6] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [13, 2663.9] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [58, 4954.15] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [13, 8181.04] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [69, 12073.2] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [70, 16273.8] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [66, 19675.0] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [63, 21060.4] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [58, 4735.98] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [6, 8661.44] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [55, 14341.1] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [6, 21721.6] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [33, 28980.9] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [45, 35093.1] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [45, 33039.3] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [2, 142.141] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [21, 250.736] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [6, 408.125] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [66, 643.002] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [66, 917.088] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [66, 1139.83] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [64, 1242.11] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [46, 308.77] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [22, 587.191] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [46, 1020.52] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [22, 1517.48] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [46, 2000.74] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [74, 2370.92] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [66, 2572.11] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [58, 725.784] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [22, 1304.6] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [13, 2125.58] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [69, 3035.5] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [69, 4017.29] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [63, 4810.84] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [64, 5195.3] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [46, 1486.55] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [44, 2648.78] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [46, 4331.84] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [66, 6314.94] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [74, 8129.97] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [64, 9840.73] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [63, 10545.1] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [58, 2796.69] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [46, 4976.93] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [7, 8191.0] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [13, 12053.7] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [74, 16096.1] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [76, 19227.5] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [66, 20928.2] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [59, 4436.07] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [20, 8234.22] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [12, 14061.6] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [12, 21365.4] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [11, 27006.9] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [12, 34570.0] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [31, 37368.4] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [32, 7932.49] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [38, 14082.2] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [16, 21178.4] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [18, 27895.2] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [38, 33316.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [3, 35361.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [35, 36518.4] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [42, 228.084] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [6, 402.27] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [46, 650.82] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [69, 1008.89] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [22, 1379.02] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [69, 1707.55] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [66, 1885.58] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [34, 496.798] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [46, 947.227] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [7, 1500.65] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [69, 2297.2] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [66, 3037.33] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [22, 3580.67] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [63, 3860.61] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [22, 1047.01] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [22, 1884.52] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [34, 3214.44] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [46, 4711.82] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [63, 6166.97] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [66, 7336.17] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [66, 7785.41] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [46, 2042.35] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [7, 3652.51] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [22, 6018.37] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [69, 9206.45] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [64, 12175.1] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [74, 14334.9] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [66, 15369.6] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [45, 3663.68] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [22, 6545.93] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [57, 10981.1] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [20, 16102.3] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [22, 21679.5] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [57, 26211.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [22, 28380.4] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [49, 6213.02] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [39, 10702.0] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [39, 18051.4] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [49, 24073.5] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [62, 30606.0] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [26, 34036.7] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [48, 37099.4] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [51, 10422.8] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [42, 16622.1] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [42, 23483.8] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [51, 30116.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [2, 33413.8] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [2, 36691.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [16, 37313.6] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [39, 454.782] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [6, 810.128] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [46, 1319.24] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [66, 2038.54] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [71, 2794.96] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [77, 3451.87] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [66, 3760.23] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [22, 1034.44] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [46, 1958.73] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [58, 3195.66] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [46, 4692.49] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [77, 6136.14] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [63, 7270.21] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [64, 7702.75] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [56, 2101.71] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [46, 3774.12] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [13, 6218.43] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [74, 9186.28] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [69, 12130.3] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [71, 14302.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [77, 15585.0] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [38, 3626.2] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [31, 6595.68] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [19, 10682.7] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [20, 15968.2] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [21, 21041.7] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [58, 26153.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [33, 28365.9] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [51, 5971.27] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [39, 10776.5] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [54, 18203.2] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [60, 24603.0] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [62, 31483.1] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [59, 34001.4] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [59, 37627.6] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [15, 10474.8] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [30, 16803.8] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [27, 24098.0] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [39, 31332.4] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [18, 34262.5] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [3, 37580.7] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [18, 39058.2] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [47, 13920.1] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [16, 21061.5] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [51, 28053.6] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [40, 32705.5] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [16, 37264.6] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [18, 39923.5] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [41, 40574.3] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [46, 963.47] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [22, 1623.18] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [58, 2724.16] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [5, 4143.21] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [69, 5591.48] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [69, 6922.48] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [64, 7706.44] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [42, 1887.63] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [22, 3597.17] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [22, 5963.48] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [46, 8840.97] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [64, 11817.7] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [74, 14597.9] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [64, 15648.9] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [55, 3489.92] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [45, 6316.72] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [33, 10590.6] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [45, 16378.7] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [46, 21949.0] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [45, 26374.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [43, 28173.3] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [24, 6168.85] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [24, 10743.1] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [42, 17652.5] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [28, 24610.6] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [24, 31603.0] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 34763.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [47, 37426.5] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [38, 10887.3] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [39, 16999.6] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [41, 24244.5] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [39, 31010.3] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [39, 34354.6] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [18, 37523.8] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 39432.3] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [47, 13686.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [42, 20792.8] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [54, 28334.9] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [16, 32858.9] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [54, 37154.2] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [18, 39958.7] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [18, 40498.2] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [54, 16087.5] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [39, 23586.3] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [39, 29628.1] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [39, 35542.2] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [54, 39207.7] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [16, 40647.0] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 41058.2] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [19, 1678.62] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [55, 2911.7] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [57, 5019.12] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [19, 7567.53] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [6, 10735.7] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [4, 12655.7] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [75, 10298.3] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [55, 3220.61] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [19, 6127.56] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [33, 10076.4] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [19, 14908.7] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [20, 21530.0] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 26064.2] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [71, 21717.2] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [24, 5793.24] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [60, 11558.5] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [49, 19328.6] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [26, 26360.3] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [47, 31997.2] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [23, 34932.9] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 36577.8] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [38, 9776.01] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [27, 17366.3] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [39, 24780.2] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [51, 31003.2] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [18, 34266.9] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 37733.5] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 38526.8] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [39, 13986.8] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [42, 21180.0] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [51, 28100.6] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [18, 33081.0] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [18, 37274.5] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 39924.7] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [0, 39642.0] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [39, 16096.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [39, 23578.0] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [16, 29621.6] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [42, 35377.9] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [18, 39180.5] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [18, 40769.0] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [52, 40033.8] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [39, 16953.1] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [51, 24535.6] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [54, 31692.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [39, 36868.0] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [51, 39602.3] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [53, 40759.8] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [59, 38496.4] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [38, 1810.23] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [12, 3632.24] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [1, 5986.52] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [50, 9024.89] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [18, 12521.5] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 15810.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [67, 14132.5] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [39, 5069.41] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [61, 8922.89] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [30, 14676.7] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [16, 21728.7] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [39, 29012.2] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [2, 34178.2] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [72, 28705.1] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [36, 8789.66] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [50, 14625.4] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [37, 21662.0] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [29, 28523.6] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [2, 33504.2] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 35874.5] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [14, 36589.4] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [25, 11798.3] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [16, 18593.6] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [39, 25839.7] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [16, 32826.1] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [39, 35634.0] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 38853.7] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [61, 38321.1] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [42, 14780.9] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [16, 22119.9] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [39, 28047.5] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [42, 34050.3] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [16, 38228.2] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 40709.9] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [42, 39644.9] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [42, 16864.2] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [39, 23391.0] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [51, 30651.7] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [39, 36170.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [16, 39707.2] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [18, 40750.7] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [38, 40255.2] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [46, 17374.8] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [42, 24761.4] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [39, 31607.4] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [39, 37076.6] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [51, 39402.8] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [18, 41023.1] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [59, 37774.0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HB.yaml new file mode 100644 index 00000000000..d30c3329d74 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HB.yaml @@ -0,0 +1,16503 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 36.6584] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [15, 66.2482] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [10, 105.216] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [47, 166.441] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [47, 231.129] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [47, 288.994] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [53, 320.819] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [12, 64.6632] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [28, 118.644] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [53, 203.726] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [47, 323.485] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [47, 461.979] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [47, 577.549] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [47, 646.098] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [41, 147.065] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [28, 268.041] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [20, 452.655] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [53, 708.497] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [47, 981.124] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [47, 1204.74] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [53, 1312.13] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [19, 317.318] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [47, 574.326] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [47, 966.875] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [53, 1487.87] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [53, 2018.92] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [47, 2505.66] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [52, 2672.86] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [40, 655.975] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [55, 1184.0] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [55, 1997.77] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [53, 3070.23] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [53, 4166.19] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [46, 5088.44] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [49, 5478.84] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [38, 1391.61] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [17, 2498.1] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [55, 4133.34] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [53, 6256.67] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [46, 8287.1] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [53, 10379.9] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [54, 10796.2] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [7, 2505.18] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [17, 4460.26] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [38, 7472.32] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [27, 11140.2] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [27, 15017.4] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [9, 18041.2] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [53, 14841.4] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [28, 73.44] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [15, 134.969] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [10, 204.58] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [28, 320.031] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [47, 449.913] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [47, 565.651] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [47, 627.139] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [45, 173.605] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [10, 323.535] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [10, 531.867] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [10, 782.228] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [55, 1023.06] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [47, 1219.1] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [53, 1300.54] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [10, 367.728] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [10, 654.133] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [8, 1063.87] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [53, 1571.05] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [47, 2075.75] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [50, 2502.85] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [50, 2630.79] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [28, 799.524] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [28, 1398.11] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [41, 2267.51] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [47, 3304.57] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [53, 4283.19] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [55, 4984.32] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [55, 5312.82] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [41, 1612.26] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [28, 2817.33] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [28, 4377.62] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [47, 6467.09] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [55, 8650.85] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [50, 10209.4] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [55, 10739.8] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [10, 2976.79] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [26, 5254.37] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [28, 8608.14] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [55, 12586.1] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [50, 16523.2] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [53, 20043.7] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [54, 20504.6] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [20, 4916.4] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [41, 9095.84] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [28, 15075.6] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [41, 22510.4] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [28, 30695.9] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [9, 36539.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [28, 32997.1] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [20, 170.5] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [9, 311.427] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [10, 493.623] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [47, 717.895] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [47, 977.867] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [54, 1172.13] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [55, 1279.22] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [28, 379.988] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [10, 684.229] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [10, 1111.66] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [10, 1621.93] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [53, 2105.84] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [53, 2496.43] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [50, 2621.85] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [32, 754.643] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [28, 1333.44] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [53, 2253.49] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [55, 3297.43] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [50, 4272.82] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [47, 5041.8] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [47, 5292.29] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [26, 1523.26] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [8, 2693.4] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [28, 4398.86] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [47, 6423.75] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [53, 8414.39] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [53, 9963.46] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [55, 10609.7] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [20, 2974.16] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [10, 5281.67] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [10, 8563.08] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [47, 12304.5] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [47, 16288.6] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [53, 19631.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [54, 20980.1] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [15, 4992.48] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [12, 8894.51] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [15, 14519.5] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [12, 21004.4] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [13, 27105.2] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [39, 34704.0] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [7, 37233.6] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [34, 9151.66] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [22, 15080.6] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [22, 23705.0] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [2, 30326.6] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [0, 34929.8] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [2, 36372.8] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [42, 36913.0] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [25, 250.377] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [7, 453.603] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [4, 746.76] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [47, 1100.39] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [53, 1488.22] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [53, 1797.37] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [47, 1925.06] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [26, 544.998] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [10, 992.66] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [28, 1618.38] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [53, 2448.04] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [55, 3157.97] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [47, 3737.13] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [50, 3965.47] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [20, 1203.19] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [10, 2120.83] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [10, 3282.79] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [53, 4840.06] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [53, 6297.75] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [50, 7403.08] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [55, 7934.69] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [41, 2306.25] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [20, 4076.76] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [41, 6625.2] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [53, 9664.32] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [53, 12355.9] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [50, 14727.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [50, 15674.8] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [34, 3781.49] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [34, 6701.06] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [39, 11046.2] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [41, 17203.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [40, 22367.2] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [39, 26115.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [39, 28500.4] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [34, 6486.04] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [56, 11157.5] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [22, 18842.0] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [43, 26955.0] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [13, 32923.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [11, 35412.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [30, 38037.8] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [23, 12973.7] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [25, 20040.5] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [2, 27114.7] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [22, 32410.6] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [21, 34530.7] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [25, 37698.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [1, 37848.7] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [1, 518.071] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [9, 947.937] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [27, 1453.33] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [53, 2206.95] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [53, 2972.93] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [53, 3626.99] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [46, 3875.54] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [20, 1111.37] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [10, 2008.13] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [20, 3269.56] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [53, 4787.11] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [50, 6266.0] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [47, 7419.72] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [50, 7903.38] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [10, 2287.4] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [28, 4036.87] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [8, 6519.66] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [47, 9379.75] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [47, 12341.5] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [53, 14587.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [55, 15924.6] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [26, 3966.89] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [16, 7002.18] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [16, 11369.2] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [41, 16722.9] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [18, 21521.9] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [41, 26239.2] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [40, 28634.7] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [45, 6681.49] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [24, 11541.3] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [34, 19459.4] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [30, 26103.9] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [45, 32141.6] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [13, 34545.5] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [32, 37948.6] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [16, 13085.1] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [6, 19831.2] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [25, 27256.1] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [25, 33344.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [34, 35708.1] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [0, 38321.0] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [2, 39535.7] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [30, 17710.0] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [23, 25159.5] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [22, 32405.4] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [6, 35575.6] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [2, 39104.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [6, 41104.4] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [25, 41108.6] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [1, 1037.85] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [3, 1794.48] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [47, 2883.35] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [53, 4409.26] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [53, 5905.4] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [55, 7219.89] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [53, 7926.88] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [1, 2154.98] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [10, 3805.51] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [10, 6264.83] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [53, 9255.57] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [47, 12215.0] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [46, 14717.9] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [47, 16145.2] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [41, 3954.4] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [20, 7055.18] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [26, 11583.8] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [9, 17219.2] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [10, 22442.0] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [8, 26709.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [41, 27642.9] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [6, 6768.65] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [32, 11558.5] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [35, 19234.4] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [12, 25982.7] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [12, 32229.0] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [29, 34549.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [32, 38137.3] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [34, 12700.4] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [25, 19892.0] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [25, 27507.4] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [22, 33418.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [37, 35893.5] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 38562.8] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 40237.7] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [32, 18191.6] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [22, 25969.3] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [25, 32777.4] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [22, 35404.3] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [37, 39146.7] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [6, 41178.5] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 41193.0] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [30, 22263.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [34, 30174.9] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [25, 34769.9] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [25, 38601.2] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [34, 41378.2] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [2, 41700.9] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [1, 42008.5] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [9, 1972.24] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [27, 3181.12] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [19, 5311.49] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [41, 7775.64] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [8, 10743.2] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [8, 12693.6] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [3, 11954.7] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [8, 3746.03] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [4, 6693.06] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [10, 10504.4] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [27, 15992.3] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [7, 21466.8] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [26, 27083.7] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [33, 22653.8] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [15, 6472.69] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [0, 11221.0] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [13, 18726.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [32, 25910.8] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [32, 33064.0] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [11, 35196.2] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [30, 37078.1] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [22, 12697.2] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [6, 19957.1] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [2, 27432.4] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [25, 33431.8] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 35728.7] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [6, 38738.6] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [22, 38129.4] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [32, 17600.0] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [25, 25620.6] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [22, 32972.0] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [6, 35763.6] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [34, 39213.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [2, 41225.1] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [22, 40048.3] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [32, 22276.8] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [15, 29866.1] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [22, 34564.7] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [6, 38560.5] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [37, 41381.6] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [6, 42072.0] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [35, 40767.3] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [32, 25918.3] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [2, 31324.0] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [22, 37431.8] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [6, 40440.3] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [34, 41824.2] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [2, 42125.6] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [36, 39694.2] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [51, 2225.09] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [48, 4034.46] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [31, 6528.73] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [14, 9786.22] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [1, 13383.6] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 16551.2] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 14478.9] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [44, 5777.28] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [34, 10209.8] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [6, 16418.1] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [22, 23707.1] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [22, 30505.8] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [6, 35357.7] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 29006.0] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [32, 9791.2] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [34, 16029.8] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [21, 23253.2] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [21, 30055.0] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [21, 35151.7] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [23, 36970.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [43, 35837.1] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [2, 15224.4] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [15, 22812.6] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [6, 30100.5] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [2, 35160.9] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [2, 37283.4] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [22, 39988.6] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [45, 38778.3] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [30, 20231.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [22, 28039.5] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [22, 34507.7] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [22, 37051.8] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [34, 40294.0] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [2, 41995.2] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 40689.7] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [30, 23944.0] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [22, 31228.9] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [25, 35623.4] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [25, 39644.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [34, 42000.7] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [6, 42446.6] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 41119.4] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [30, 24087.9] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [32, 30360.0] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [34, 34619.0] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [25, 40413.4] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [34, 42253.0] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [6, 42182.1] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [44, 38552.0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HB_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HB_GB.yaml new file mode 100644 index 00000000000..9e862185372 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HB_GB.yaml @@ -0,0 +1,16503 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW4_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW4_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_HB_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 36.6584] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [15, 66.2482] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [10, 105.216] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [47, 166.441] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [47, 231.129] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [47, 288.994] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [53, 320.819] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [12, 64.6632] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [28, 118.644] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [53, 203.726] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [47, 323.485] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [47, 461.979] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [47, 577.549] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [47, 646.098] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [41, 147.065] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [28, 268.041] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [20, 452.655] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [53, 708.497] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [47, 981.124] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [47, 1204.74] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [53, 1312.13] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [19, 317.318] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [47, 574.326] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [47, 966.875] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [53, 1487.87] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [53, 2018.92] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [47, 2505.66] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [52, 2672.86] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [40, 655.975] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [55, 1184.0] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [55, 1997.77] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [53, 3070.23] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [53, 4166.19] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [46, 5088.44] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [49, 5478.84] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [38, 1391.61] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [17, 2498.1] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [55, 4133.34] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [53, 6256.67] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [46, 8287.1] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [53, 10379.9] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [54, 10796.2] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [7, 2505.18] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [17, 4460.26] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [38, 7472.32] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [27, 11140.2] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [27, 15017.4] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [9, 18041.2] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [53, 14841.4] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [28, 73.44] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [15, 134.969] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [10, 204.58] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [28, 320.031] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [47, 449.913] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [47, 565.651] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [47, 627.139] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [45, 173.605] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [10, 323.535] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [10, 531.867] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [10, 782.228] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [55, 1023.06] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [47, 1219.1] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [53, 1300.54] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [10, 367.728] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [10, 654.133] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [8, 1063.87] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [53, 1571.05] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [47, 2075.75] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [50, 2502.85] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [50, 2630.79] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [28, 799.524] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [28, 1398.11] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [41, 2267.51] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [47, 3304.57] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [53, 4283.19] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [55, 4984.32] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [55, 5312.82] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [41, 1612.26] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [28, 2817.33] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [28, 4377.62] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [47, 6467.09] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [55, 8650.85] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [50, 10209.4] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [55, 10739.8] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [10, 2976.79] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [26, 5254.37] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [28, 8608.14] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [55, 12586.1] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [50, 16523.2] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [53, 20043.7] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [54, 20504.6] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [20, 4916.4] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [41, 9095.84] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [28, 15075.6] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [41, 22510.4] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [28, 30695.9] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [9, 36539.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [28, 32997.1] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [20, 170.5] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [9, 311.427] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [10, 493.623] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [47, 717.895] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [47, 977.867] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [54, 1172.13] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [55, 1279.22] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [28, 379.988] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [10, 684.229] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [10, 1111.66] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [10, 1621.93] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [53, 2105.84] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [53, 2496.43] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [50, 2621.85] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [32, 754.643] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [28, 1333.44] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [53, 2253.49] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [55, 3297.43] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [50, 4272.82] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [47, 5041.8] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [47, 5292.29] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [26, 1523.26] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [8, 2693.4] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [28, 4398.86] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [47, 6423.75] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [53, 8414.39] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [53, 9963.46] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [55, 10609.7] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [20, 2974.16] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [10, 5281.67] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [10, 8563.08] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [47, 12304.5] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [47, 16288.6] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [53, 19631.1] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [54, 20980.1] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [15, 4992.48] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [12, 8894.51] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [15, 14519.5] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [12, 21004.4] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [13, 27105.2] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [39, 34704.0] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [7, 37233.6] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [34, 9151.66] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [22, 15080.6] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [22, 23705.0] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [2, 30326.6] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [0, 34929.8] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [2, 36372.8] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [42, 36913.0] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [25, 250.377] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [7, 453.603] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [4, 746.76] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [47, 1100.39] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [53, 1488.22] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [53, 1797.37] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [47, 1925.06] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [26, 544.998] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [10, 992.66] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [28, 1618.38] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [53, 2448.04] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [55, 3157.97] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [47, 3737.13] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [50, 3965.47] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [20, 1203.19] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [10, 2120.83] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [10, 3282.79] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [53, 4840.06] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [53, 6297.75] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [50, 7403.08] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [55, 7934.69] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [41, 2306.25] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [20, 4076.76] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [41, 6625.2] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [53, 9664.32] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [53, 12355.9] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [50, 14727.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [50, 15674.8] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [34, 3781.49] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [34, 6701.06] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [39, 11046.2] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [41, 17203.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [40, 22367.2] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [39, 26115.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [39, 28500.4] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [34, 6486.04] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [56, 11157.5] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [22, 18842.0] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [43, 26955.0] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [13, 32923.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [11, 35412.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [30, 38037.8] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [23, 12973.7] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [25, 20040.5] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [2, 27114.7] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [22, 32410.6] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [21, 34530.7] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [25, 37698.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [1, 37848.7] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [1, 518.071] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [9, 947.937] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [27, 1453.33] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [53, 2206.95] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [53, 2972.93] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [53, 3626.99] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [46, 3875.54] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [20, 1111.37] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [10, 2008.13] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [20, 3269.56] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [53, 4787.11] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [50, 6266.0] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [47, 7419.72] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [50, 7903.38] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [10, 2287.4] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [28, 4036.87] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [8, 6519.66] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [47, 9379.75] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [47, 12341.5] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [53, 14587.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [55, 15924.6] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [26, 3966.89] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [16, 7002.18] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [16, 11369.2] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [41, 16722.9] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [18, 21521.9] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [41, 26239.2] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [40, 28634.7] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [45, 6681.49] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [24, 11541.3] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [34, 19459.4] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [30, 26103.9] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [45, 32141.6] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [13, 34545.5] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [32, 37948.6] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [16, 13085.1] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [6, 19831.2] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [25, 27256.1] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [25, 33344.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [34, 35708.1] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [0, 38321.0] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [2, 39535.7] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [30, 17710.0] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [23, 25159.5] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [22, 32405.4] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [6, 35575.6] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [2, 39104.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [6, 41104.4] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [25, 41108.6] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [1, 1037.85] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [3, 1794.48] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [47, 2883.35] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [53, 4409.26] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [53, 5905.4] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [55, 7219.89] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [53, 7926.88] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [1, 2154.98] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [10, 3805.51] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [10, 6264.83] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [53, 9255.57] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [47, 12215.0] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [46, 14717.9] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [47, 16145.2] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [41, 3954.4] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [20, 7055.18] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [26, 11583.8] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [9, 17219.2] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [10, 22442.0] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [8, 26709.1] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [41, 27642.9] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [6, 6768.65] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [32, 11558.5] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [35, 19234.4] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [12, 25982.7] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [12, 32229.0] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [29, 34549.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [32, 38137.3] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [34, 12700.4] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [25, 19892.0] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [25, 27507.4] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [22, 33418.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [37, 35893.5] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 38562.8] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 40237.7] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [32, 18191.6] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [22, 25969.3] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [25, 32777.4] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [22, 35404.3] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [37, 39146.7] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [6, 41178.5] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 41193.0] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [30, 22263.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [34, 30174.9] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [25, 34769.9] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [25, 38601.2] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [34, 41378.2] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [2, 41700.9] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [1, 42008.5] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [9, 1972.24] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [27, 3181.12] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [19, 5311.49] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [41, 7775.64] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [8, 10743.2] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [8, 12693.6] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [3, 11954.7] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [8, 3746.03] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [4, 6693.06] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [10, 10504.4] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [27, 15992.3] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [7, 21466.8] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [26, 27083.7] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [33, 22653.8] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [15, 6472.69] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [0, 11221.0] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [13, 18726.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [32, 25910.8] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [32, 33064.0] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [11, 35196.2] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [30, 37078.1] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [22, 12697.2] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [6, 19957.1] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [2, 27432.4] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [25, 33431.8] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 35728.7] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [6, 38738.6] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [22, 38129.4] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [32, 17600.0] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [25, 25620.6] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [22, 32972.0] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [6, 35763.6] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [34, 39213.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [2, 41225.1] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [22, 40048.3] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [32, 22276.8] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [15, 29866.1] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [22, 34564.7] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [6, 38560.5] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [37, 41381.6] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [6, 42072.0] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [35, 40767.3] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [32, 25918.3] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [2, 31324.0] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [22, 37431.8] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [6, 40440.3] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [34, 41824.2] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [2, 42125.6] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [36, 39694.2] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [51, 2225.09] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [48, 4034.46] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [31, 6528.73] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [14, 9786.22] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [1, 13383.6] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 16551.2] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 14478.9] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [44, 5777.28] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [34, 10209.8] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [6, 16418.1] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [22, 23707.1] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [22, 30505.8] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [6, 35357.7] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 29006.0] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [32, 9791.2] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [34, 16029.8] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [21, 23253.2] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [21, 30055.0] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [21, 35151.7] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [23, 36970.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [43, 35837.1] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [2, 15224.4] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [15, 22812.6] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [6, 30100.5] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [2, 35160.9] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [2, 37283.4] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [22, 39988.6] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [45, 38778.3] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [30, 20231.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [22, 28039.5] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [22, 34507.7] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [22, 37051.8] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [34, 40294.0] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [2, 41995.2] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 40689.7] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [30, 23944.0] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [22, 31228.9] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [25, 35623.4] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [25, 39644.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [34, 42000.7] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [6, 42446.6] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 41119.4] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [30, 24087.9] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [32, 30360.0] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [34, 34619.0] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [25, 40413.4] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [34, 42253.0] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [6, 42182.1] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [44, 38552.0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HHS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HHS_BH.yaml new file mode 100644 index 00000000000..0d5abb1e5a9 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HHS_BH.yaml @@ -0,0 +1,17313 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [4, 35.3724] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [19, 59.1948] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [46, 102.141] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [9, 158.587] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [49, 223.864] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [54, 281.195] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [46, 309.224] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [2, 63.0231] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [19, 114.237] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [46, 197.807] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [51, 312.682] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [54, 440.624] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [49, 557.161] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [54, 619.337] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [9, 140.09] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [46, 254.695] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [18, 434.285] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [45, 697.773] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [46, 946.208] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [49, 1151.45] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [54, 1270.91] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [47, 299.166] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [11, 568.105] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [46, 952.169] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [46, 1458.89] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [54, 1940.35] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [46, 2353.13] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [54, 2564.69] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [46, 617.445] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [9, 1113.58] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [46, 1884.66] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [50, 2876.02] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [49, 3952.46] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [48, 4816.71] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [46, 5224.21] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [18, 1235.8] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [18, 2225.69] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [53, 3744.09] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [53, 5785.25] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [53, 7990.58] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [54, 10169.9] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [48, 10509.8] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [8, 2249.57] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [8, 4087.02] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [18, 6900.65] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [6, 10499.7] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [16, 14324.2] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [9, 17619.1] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [53, 14861.1] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [44, 73.5843] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [2, 127.751] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [18, 203.173] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [7, 315.884] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [51, 440.186] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [54, 553.247] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [54, 619.863] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [37, 169.207] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [37, 295.041] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [18, 487.993] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [9, 734.361] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [51, 978.208] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [51, 1184.71] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [46, 1277.36] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [34, 356.962] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [16, 627.61] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [9, 1035.25] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [18, 1535.81] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [49, 2030.77] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [49, 2461.81] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [49, 2584.59] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [11, 739.867] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [18, 1309.9] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [9, 2145.15] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [46, 3157.48] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [56, 4113.59] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [51, 4952.51] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [54, 5203.55] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [18, 1494.76] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [16, 2639.17] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [18, 4302.97] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [59, 6406.57] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [46, 8431.3] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [46, 9816.64] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [46, 10526.7] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [18, 2796.67] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [18, 4985.8] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [18, 8513.11] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [46, 12392.0] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [51, 16474.5] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [48, 19672.1] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [53, 20650.1] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [15, 5112.67] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [18, 9081.04] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [8, 14966.3] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [8, 22407.0] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [15, 29438.6] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [17, 35692.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [18, 33090.7] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [55, 158.803] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [18, 280.894] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [9, 447.346] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [50, 687.704] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [54, 927.328] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [54, 1154.54] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [54, 1250.8] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [11, 345.779] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [9, 629.397] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [9, 1039.09] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [51, 1534.55] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [51, 2066.04] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [54, 2430.25] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [49, 2579.62] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [34, 741.174] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [9, 1316.89] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [9, 2130.98] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [51, 3239.47] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [46, 4199.03] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [51, 4865.96] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [54, 5175.46] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [41, 1496.9] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [52, 2752.62] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [18, 4514.86] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [54, 6514.79] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [18, 8293.24] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [49, 9786.57] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [49, 10362.7] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [18, 2958.96] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [18, 5225.73] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [18, 8489.45] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [18, 12370.3] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [18, 15773.6] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [54, 19280.0] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [54, 20919.2] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [7, 5002.9] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [10, 8749.55] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [16, 14328.8] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [16, 21569.7] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [16, 29184.2] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [7, 34958.2] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [16, 36146.9] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [14, 8672.66] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [56, 15418.5] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [34, 23770.1] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [32, 31115.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [41, 37117.8] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [28, 39090.7] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [32, 40480.0] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [7, 235.847] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [18, 416.82] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [51, 667.743] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [46, 1032.23] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [51, 1436.41] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [54, 1731.58] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [49, 1889.15] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [34, 528.96] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [9, 963.175] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [46, 1629.07] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [54, 2392.42] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [54, 3126.97] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [49, 3645.11] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [49, 3860.23] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [21, 1173.13] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [18, 2074.33] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [7, 3346.53] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [51, 4865.31] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [54, 6195.06] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [51, 7306.1] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [54, 7816.39] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [7, 2269.24] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [18, 4000.31] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [9, 6496.92] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [46, 9208.13] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [49, 12101.9] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [54, 14474.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [54, 15549.2] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [12, 3905.92] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [32, 6686.81] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [17, 11179.9] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [16, 16662.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [18, 22290.4] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [17, 26651.7] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [18, 28790.8] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [34, 6469.36] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [35, 12217.9] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [37, 18670.8] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [20, 27522.4] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [22, 34049.6] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [26, 36463.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [39, 36406.3] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [16, 12657.3] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [3, 19315.6] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [14, 26391.4] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [4, 31297.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [14, 33942.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [13, 36838.9] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [5, 37450.5] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [16, 485.752] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [9, 820.161] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [9, 1376.23] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [46, 2110.52] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [54, 2862.03] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [49, 3506.33] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [49, 3781.92] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [18, 1078.23] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [18, 1967.0] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [7, 3195.66] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [58, 4725.98] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [49, 6249.28] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [54, 7295.51] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [49, 7735.16] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [18, 2141.41] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [9, 3815.89] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [18, 6442.04] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [18, 9399.0] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [9, 12125.2] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [54, 14413.4] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [54, 15420.0] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [37, 3905.31] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [34, 6931.78] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [18, 11536.0] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [17, 16956.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [6, 21543.8] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [18, 26583.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [18, 28557.0] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [34, 6702.84] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [34, 11594.5] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [23, 19451.8] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [30, 26711.8] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [38, 32086.5] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [26, 35618.1] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [30, 39787.9] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [14, 12965.4] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [44, 20042.6] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [32, 27501.8] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [28, 33332.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [44, 36359.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [21, 40073.0] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [44, 41485.0] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [34, 17333.4] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [34, 25453.9] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [21, 32512.7] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [34, 35809.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [44, 40149.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [41, 42542.7] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [34, 42891.8] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [0, 975.272] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [7, 1632.24] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [54, 2743.77] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [54, 4194.31] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [59, 5595.21] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [54, 7021.23] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [48, 7699.95] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [12, 2143.61] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [9, 3924.82] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [18, 6387.27] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [9, 9313.78] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [54, 11926.9] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [46, 14667.0] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [46, 15682.7] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [18, 3721.1] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [6, 6660.29] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [16, 11092.4] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [15, 16444.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [44, 21416.6] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [44, 25685.2] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [28, 27191.2] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [44, 6457.74] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [58, 12238.7] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [26, 19137.5] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [43, 27369.1] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [38, 33599.3] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [30, 36802.3] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [29, 40257.3] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [13, 12478.4] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [41, 19616.7] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [28, 27098.3] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [44, 34226.2] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [21, 36458.2] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 40071.5] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [28, 41980.3] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [44, 17843.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [44, 25563.7] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [41, 32223.9] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [41, 36222.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [41, 40045.6] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 42563.0] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [44, 43059.2] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [41, 21679.5] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [34, 29599.3] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [34, 33815.1] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [44, 38645.2] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [41, 41823.0] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [44, 42770.2] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [36, 43247.3] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [15, 1879.73] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [31, 3021.83] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [15, 5104.11] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [8, 7937.49] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [15, 10948.2] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [7, 12625.9] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 12220.3] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [44, 3658.89] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [16, 6709.1] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [15, 10666.9] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [16, 16065.0] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [6, 21593.5] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 25465.9] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [27, 22539.3] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [24, 6864.65] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [60, 11658.9] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [41, 18509.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [38, 26204.2] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [26, 34467.8] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [42, 37075.9] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [42, 35658.3] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [4, 12361.9] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [5, 19468.8] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [28, 27140.3] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [37, 33603.5] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 36666.5] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 39971.6] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [32, 40473.6] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [34, 17348.3] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [34, 25067.2] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [44, 32251.0] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [37, 35884.7] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 40045.1] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 42618.2] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [44, 42768.5] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [44, 21665.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [37, 29264.8] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [34, 34128.2] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [41, 38685.1] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [44, 41801.3] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 42907.2] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [41, 42739.0] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [34, 24917.5] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [41, 30784.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [41, 36354.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [41, 40252.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [37, 41771.8] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [40, 42839.0] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [44, 41231.2] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [1, 2395.03] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [24, 4100.52] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [33, 6805.48] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [33, 10446.6] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [27, 13849.0] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [40, 16299.5] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 14729.2] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [37, 5800.27] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [25, 10186.6] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [34, 16292.6] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [37, 22693.0] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [37, 29239.8] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [25, 34083.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [59, 28949.3] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [37, 9727.33] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [41, 16118.4] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [34, 23625.8] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [37, 30986.4] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [28, 37067.8] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [32, 38750.7] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [28, 37900.3] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [41, 14441.3] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [44, 21972.3] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [34, 29578.1] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [28, 36156.6] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [41, 38337.7] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [41, 41676.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [28, 41748.0] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [34, 19134.4] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [41, 27035.5] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [37, 33723.1] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [41, 37471.4] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [41, 40968.1] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [28, 43023.1] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [34, 42660.1] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [34, 23126.1] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [44, 30466.0] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [41, 35304.8] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [44, 39559.8] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [41, 42293.5] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [44, 42711.1] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [44, 43141.1] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [44, 23646.6] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [37, 30074.7] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [13, 34815.7] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [13, 39747.8] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [44, 41444.8] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [37, 42986.9] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [60, 40108.5] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HHS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HHS_BH_GB.yaml new file mode 100644 index 00000000000..44c2991f6b6 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_HHS_BH_GB.yaml @@ -0,0 +1,17313 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR1_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA2_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 4 + LVCB: 32 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4352 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR1_SIA3_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS0_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW4_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25088 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_EPS1_GRVW8_PLR0_SIA1_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 8 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW4_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS0_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8704 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_EPS1_GRVW8_PLR0_SIA3_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [4, 35.3724] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [19, 59.1948] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [46, 102.141] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [9, 158.587] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [49, 223.864] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [54, 281.195] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [46, 309.224] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [2, 63.0231] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [19, 114.237] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [46, 197.807] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [51, 312.682] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [54, 440.624] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [49, 557.161] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [54, 619.337] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [9, 140.09] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [46, 254.695] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [18, 434.285] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [45, 697.773] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [46, 946.208] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [49, 1151.45] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [54, 1270.91] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [47, 299.166] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [11, 568.105] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [46, 952.169] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [46, 1458.89] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [54, 1940.35] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [46, 2353.13] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [54, 2564.69] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [46, 617.445] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [9, 1113.58] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [46, 1884.66] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [50, 2876.02] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [49, 3952.46] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [48, 4816.71] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [46, 5224.21] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [18, 1235.8] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [18, 2225.69] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [53, 3744.09] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [53, 5785.25] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [53, 7990.58] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [54, 10169.9] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [48, 10509.8] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [8, 2249.57] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [8, 4087.02] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [18, 6900.65] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [6, 10499.7] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [16, 14324.2] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [9, 17619.1] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [53, 14861.1] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [44, 73.5843] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [2, 127.751] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [18, 203.173] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [7, 315.884] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [51, 440.186] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [54, 553.247] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [54, 619.863] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [37, 169.207] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [37, 295.041] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [18, 487.993] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [9, 734.361] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [51, 978.208] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [51, 1184.71] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [46, 1277.36] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [34, 356.962] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [16, 627.61] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [9, 1035.25] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [18, 1535.81] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [49, 2030.77] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [49, 2461.81] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [49, 2584.59] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [11, 739.867] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [18, 1309.9] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [9, 2145.15] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [46, 3157.48] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [56, 4113.59] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [51, 4952.51] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [54, 5203.55] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [18, 1494.76] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [16, 2639.17] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [18, 4302.97] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [59, 6406.57] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [46, 8431.3] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [46, 9816.64] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [46, 10526.7] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [18, 2796.67] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [18, 4985.8] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [18, 8513.11] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [46, 12392.0] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [51, 16474.5] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [48, 19672.1] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [53, 20650.1] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [15, 5112.67] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [18, 9081.04] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [8, 14966.3] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [8, 22407.0] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [15, 29438.6] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [17, 35692.6] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [18, 33090.7] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [55, 158.803] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [18, 280.894] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [9, 447.346] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [50, 687.704] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [54, 927.328] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [54, 1154.54] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [54, 1250.8] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [11, 345.779] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [9, 629.397] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [9, 1039.09] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [51, 1534.55] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [51, 2066.04] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [54, 2430.25] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [49, 2579.62] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [34, 741.174] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [9, 1316.89] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [9, 2130.98] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [51, 3239.47] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [46, 4199.03] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [51, 4865.96] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [54, 5175.46] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [41, 1496.9] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [52, 2752.62] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [18, 4514.86] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [54, 6514.79] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [18, 8293.24] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [49, 9786.57] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [49, 10362.7] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [18, 2958.96] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [18, 5225.73] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [18, 8489.45] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [18, 12370.3] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [18, 15773.6] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [54, 19280.0] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [54, 20919.2] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [7, 5002.9] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [10, 8749.55] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [16, 14328.8] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [16, 21569.7] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [16, 29184.2] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [7, 34958.2] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [16, 36146.9] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [14, 8672.66] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [56, 15418.5] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [34, 23770.1] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [32, 31115.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [41, 37117.8] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [28, 39090.7] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [32, 40480.0] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [7, 235.847] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [18, 416.82] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [51, 667.743] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [46, 1032.23] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [51, 1436.41] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [54, 1731.58] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [49, 1889.15] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [34, 528.96] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [9, 963.175] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [46, 1629.07] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [54, 2392.42] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [54, 3126.97] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [49, 3645.11] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [49, 3860.23] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [21, 1173.13] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [18, 2074.33] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [7, 3346.53] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [51, 4865.31] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [54, 6195.06] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [51, 7306.1] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [54, 7816.39] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [7, 2269.24] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [18, 4000.31] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [9, 6496.92] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [46, 9208.13] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [49, 12101.9] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [54, 14474.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [54, 15549.2] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [12, 3905.92] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [32, 6686.81] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [17, 11179.9] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [16, 16662.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [18, 22290.4] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [17, 26651.7] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [18, 28790.8] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [34, 6469.36] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [35, 12217.9] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [37, 18670.8] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [20, 27522.4] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [22, 34049.6] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [26, 36463.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [39, 36406.3] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [16, 12657.3] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [3, 19315.6] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [14, 26391.4] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [4, 31297.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [14, 33942.0] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [13, 36838.9] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [5, 37450.5] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [16, 485.752] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [9, 820.161] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [9, 1376.23] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [46, 2110.52] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [54, 2862.03] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [49, 3506.33] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [49, 3781.92] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [18, 1078.23] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [18, 1967.0] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [7, 3195.66] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [58, 4725.98] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [49, 6249.28] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [54, 7295.51] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [49, 7735.16] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [18, 2141.41] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [9, 3815.89] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [18, 6442.04] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [18, 9399.0] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [9, 12125.2] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [54, 14413.4] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [54, 15420.0] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [37, 3905.31] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [34, 6931.78] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [18, 11536.0] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [17, 16956.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [6, 21543.8] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [18, 26583.1] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [18, 28557.0] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [34, 6702.84] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [34, 11594.5] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [23, 19451.8] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [30, 26711.8] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [38, 32086.5] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [26, 35618.1] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [30, 39787.9] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [14, 12965.4] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [44, 20042.6] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [32, 27501.8] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [28, 33332.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [44, 36359.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [21, 40073.0] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [44, 41485.0] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [34, 17333.4] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [34, 25453.9] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [21, 32512.7] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [34, 35809.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [44, 40149.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [41, 42542.7] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [34, 42891.8] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [0, 975.272] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [7, 1632.24] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [54, 2743.77] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [54, 4194.31] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [59, 5595.21] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [54, 7021.23] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [48, 7699.95] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [12, 2143.61] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [9, 3924.82] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [18, 6387.27] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [9, 9313.78] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [54, 11926.9] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [46, 14667.0] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [46, 15682.7] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [18, 3721.1] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [6, 6660.29] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [16, 11092.4] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [15, 16444.3] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [44, 21416.6] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [44, 25685.2] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [28, 27191.2] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [44, 6457.74] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [58, 12238.7] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [26, 19137.5] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [43, 27369.1] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [38, 33599.3] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [30, 36802.3] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [29, 40257.3] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [13, 12478.4] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [41, 19616.7] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [28, 27098.3] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [44, 34226.2] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [21, 36458.2] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 40071.5] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [28, 41980.3] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [44, 17843.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [44, 25563.7] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [41, 32223.9] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [41, 36222.1] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [41, 40045.6] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 42563.0] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [44, 43059.2] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [41, 21679.5] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [34, 29599.3] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [34, 33815.1] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [44, 38645.2] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [41, 41823.0] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [44, 42770.2] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [36, 43247.3] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [15, 1879.73] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [31, 3021.83] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [15, 5104.11] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [8, 7937.49] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [15, 10948.2] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [7, 12625.9] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [9, 12220.3] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [44, 3658.89] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [16, 6709.1] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [15, 10666.9] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [16, 16065.0] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [6, 21593.5] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 25465.9] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [27, 22539.3] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [24, 6864.65] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [60, 11658.9] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [41, 18509.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [38, 26204.2] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [26, 34467.8] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [42, 37075.9] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [42, 35658.3] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [4, 12361.9] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [5, 19468.8] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [28, 27140.3] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [37, 33603.5] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 36666.5] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 39971.6] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [32, 40473.6] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [34, 17348.3] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [34, 25067.2] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [44, 32251.0] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [37, 35884.7] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 40045.1] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 42618.2] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [44, 42768.5] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [44, 21665.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [37, 29264.8] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [34, 34128.2] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [41, 38685.1] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [44, 41801.3] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [44, 42907.2] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [41, 42739.0] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [34, 24917.5] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [41, 30784.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [41, 36354.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [41, 40252.5] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [37, 41771.8] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [40, 42839.0] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [44, 41231.2] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [1, 2395.03] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [24, 4100.52] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [33, 6805.48] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [33, 10446.6] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [27, 13849.0] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [40, 16299.5] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [57, 14729.2] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [37, 5800.27] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [25, 10186.6] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [34, 16292.6] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [37, 22693.0] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [37, 29239.8] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [25, 34083.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [59, 28949.3] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [37, 9727.33] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [41, 16118.4] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [34, 23625.8] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [37, 30986.4] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [28, 37067.8] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [32, 38750.7] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [28, 37900.3] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [41, 14441.3] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [44, 21972.3] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [34, 29578.1] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [28, 36156.6] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [41, 38337.7] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [41, 41676.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [28, 41748.0] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [34, 19134.4] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [41, 27035.5] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [37, 33723.1] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [41, 37471.4] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [41, 40968.1] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [28, 43023.1] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [34, 42660.1] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [34, 23126.1] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [44, 30466.0] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [41, 35304.8] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [44, 39559.8] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [41, 42293.5] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [44, 42711.1] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [44, 43141.1] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [44, 23646.6] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [37, 30074.7] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [13, 34815.7] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [13, 39747.8] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [44, 41444.8] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [37, 42986.9] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [60, 40108.5] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_I8II_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_I8II_BH.yaml new file mode 100644 index 00000000000..9dc2e0a7f20 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_I8II_BH.yaml @@ -0,0 +1,26493 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8448 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 36.4343] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [7, 61.0845] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [84, 105.068] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [39, 168.473] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [39, 233.458] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [56, 295.885] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [84, 327.309] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [16, 64.251] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [16, 117.95] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [39, 204.6] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [39, 335.303] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [38, 472.811] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [49, 595.254] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [65, 673.054] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [23, 152.078] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [39, 278.581] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [38, 457.943] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [66, 713.865] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [48, 993.322] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [85, 1234.03] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [36, 1328.6] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [18, 332.409] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [38, 601.852] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [83, 1013.61] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [36, 1501.45] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [65, 2095.58] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [36, 2525.27] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [36, 2673.82] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [9, 677.377] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [38, 1240.92] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [36, 2085.68] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [83, 3155.39] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [73, 4164.9] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [83, 5128.3] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [73, 5361.96] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [73, 1288.37] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [40, 2315.38] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [73, 3955.96] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [36, 6077.05] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [73, 8239.27] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [36, 10096.1] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [65, 11119.3] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [27, 2196.84] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [25, 3963.43] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [9, 6707.53] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [20, 9958.28] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [31, 13923.7] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [15, 16947.8] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [6, 18577.2] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [45, 65.8986] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [6, 122.084] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [6, 203.924] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [87, 323.71] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [41, 466.786] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [40, 585.368] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [58, 666.477] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [28, 181.857] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [8, 315.456] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [39, 517.304] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [77, 802.816] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [59, 1046.42] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [76, 1282.07] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [51, 1338.97] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [39, 392.138] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [39, 691.901] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [85, 1136.05] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [69, 1683.28] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [88, 2160.48] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [69, 2618.27] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [51, 2759.92] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [39, 799.376] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [38, 1419.4] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [58, 2233.09] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [51, 3434.44] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [51, 4356.59] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [59, 5244.93] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [58, 5382.06] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [39, 1395.54] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [74, 2626.36] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [58, 4353.2] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [41, 6514.17] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [88, 8419.13] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [58, 10179.2] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [59, 11117.0] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [39, 2496.98] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [57, 4718.67] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [39, 7982.5] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [41, 11752.9] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [88, 16233.4] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [36, 19544.6] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [59, 21654.6] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [38, 5001.44] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [27, 8921.68] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [79, 15829.4] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [24, 23407.3] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [21, 31119.4] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [79, 35971.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [5, 37787.9] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [1, 155.115] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [37, 276.305] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [36, 470.69] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [87, 736.618] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [76, 1003.78] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [58, 1221.32] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [41, 1317.31] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [38, 388.006] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [27, 685.68] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [76, 1130.84] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [51, 1695.7] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [88, 2208.55] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [58, 2580.72] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [87, 2733.01] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [58, 791.83] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [6, 1378.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [41, 2204.05] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [50, 3314.35] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [50, 4472.43] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [69, 5192.58] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [68, 5359.49] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [17, 1435.18] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [67, 2498.84] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [77, 4343.62] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [50, 6528.73] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [87, 8487.27] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [69, 10229.6] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [40, 11111.4] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [57, 2614.09] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [44, 4717.36] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [58, 7728.76] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [50, 12248.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [69, 16189.3] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [58, 19777.2] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [40, 21984.4] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [60, 5004.41] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [15, 8882.71] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [6, 14686.3] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [21, 23356.5] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [78, 30780.4] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [6, 35913.5] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [31, 37662.0] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [50, 9097.04] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [85, 15353.2] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [63, 22876.8] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [81, 30093.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [55, 35581.5] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [63, 37636.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [93, 40145.2] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [0, 237.557] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [67, 427.525] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [68, 727.59] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [68, 1096.26] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [87, 1486.9] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [76, 1850.63] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [59, 1979.19] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [41, 594.993] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [39, 1042.32] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [6, 1692.39] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [51, 2483.31] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [88, 3348.08] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [76, 3905.62] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [40, 4033.15] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [76, 1156.52] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [39, 2067.52] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [41, 3255.18] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [51, 4899.89] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [50, 6593.95] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [88, 7707.46] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [88, 8029.62] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [86, 2072.28] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [85, 3562.05] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [51, 6016.21] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [88, 9195.51] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [51, 12683.6] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [87, 15155.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [88, 16740.3] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [30, 3623.07] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [28, 6477.69] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [90, 11132.9] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [32, 17228.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [26, 22142.1] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [34, 26662.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [90, 28382.4] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [77, 8056.93] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [75, 13508.3] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [81, 19806.0] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [81, 28038.0] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [81, 33376.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [93, 36319.3] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [93, 38891.5] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [74, 11956.7] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [73, 19519.7] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [85, 26332.7] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [13, 31788.8] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [74, 34219.6] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [73, 37183.3] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [41, 38906.3] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [3, 518.842] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [36, 858.785] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [36, 1516.56] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [76, 2285.93] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [59, 3034.95] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [40, 3732.01] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [58, 3974.32] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [76, 1141.62] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [38, 2028.19] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [38, 3342.07] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [50, 4851.24] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [77, 6556.16] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [59, 7680.13] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [41, 7994.71] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [85, 2034.42] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [40, 3648.81] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [50, 5934.64] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [50, 9104.88] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [50, 12528.1] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [76, 14913.6] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [76, 16691.3] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [19, 3772.42] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [85, 6467.7] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [20, 10793.8] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [70, 17176.6] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [15, 22504.6] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [31, 27179.7] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [31, 28915.4] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [84, 8033.78] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [29, 13200.0] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [81, 19763.1] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [82, 27977.6] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [93, 33266.1] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [93, 35939.2] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [93, 39032.4] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [73, 12340.7] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [13, 19012.8] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [94, 26329.3] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [81, 32810.8] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [93, 35496.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [46, 38748.8] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [94, 40602.6] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [92, 16915.4] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [83, 24720.8] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [63, 31396.0] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [46, 35174.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [63, 39080.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [81, 41196.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [93, 41732.1] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [0, 961.555] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [85, 1732.47] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [87, 2935.82] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [68, 4522.97] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [58, 6027.74] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [58, 7438.1] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [50, 7991.38] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [41, 1995.39] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [38, 3640.9] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [87, 6077.99] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [50, 9274.33] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [58, 12202.4] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 14979.7] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [41, 16861.9] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [36, 3744.36] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [60, 6428.89] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [89, 11314.4] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [32, 16829.1] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [42, 22465.7] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [4, 26640.2] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [52, 28402.9] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [87, 7699.5] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [75, 12940.4] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [72, 20375.1] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [93, 27291.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [63, 33830.8] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 36211.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [93, 38960.7] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [83, 11834.4] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [57, 19533.0] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [22, 25987.7] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [81, 33271.7] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [63, 35717.6] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 38785.7] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [94, 40624.1] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [92, 17182.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [75, 25107.8] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [63, 31219.5] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [81, 35398.9] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [81, 39111.6] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [81, 41169.6] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [93, 41751.3] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [91, 22034.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [85, 29172.5] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [93, 33740.7] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [93, 38246.9] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [63, 40816.9] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [94, 41277.3] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [46, 41750.4] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [35, 1578.0] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [4, 2931.37] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [4, 4832.16] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [27, 7409.34] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [11, 10308.1] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [78, 12814.4] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [42, 13927.1] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [36, 3979.43] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [43, 6292.26] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [31, 11131.6] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [6, 16619.3] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [60, 22457.0] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [4, 26709.1] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [4, 28908.6] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [58, 7641.06] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [61, 13427.1] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [63, 20288.9] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [64, 27274.5] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [63, 33859.2] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [47, 36139.2] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [72, 38961.1] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [65, 11867.9] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [83, 19514.1] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [63, 25869.2] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [63, 33237.3] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [63, 35725.5] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [64, 38951.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [72, 40659.2] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [80, 17518.9] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [57, 24552.1] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [81, 31895.9] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [81, 35489.4] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [81, 39129.6] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [81, 41200.3] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [93, 41732.1] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [80, 21727.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [75, 29237.1] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [93, 33840.7] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [81, 38302.4] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [63, 40853.4] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [81, 41506.4] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [63, 41389.9] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [45, 6763.2] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [53, 12835.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [45, 23148.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [92, 35655.0] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [93, 39466.7] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [64, 41295.0] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [81, 41849.2] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [78, 1791.1] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [10, 3576.09] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [14, 5959.42] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [14, 9558.99] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 13081.7] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [12, 16531.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [11, 18058.3] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [68, 5773.3] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [42, 10035.7] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [12, 16141.6] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [24, 23615.3] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [3, 30533.6] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [42, 35860.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [0, 37730.8] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [68, 9804.11] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [62, 15429.1] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [93, 22639.5] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [63, 30681.9] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [81, 35673.7] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [63, 37833.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [46, 40145.6] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [71, 13770.2] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [75, 21377.4] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [63, 28915.4] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [81, 35019.8] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [63, 37461.6] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [81, 40218.5] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [93, 41715.8] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [80, 18825.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [75, 27206.8] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [81, 33398.9] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [63, 36738.7] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [81, 39857.6] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [81, 41701.6] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [63, 41326.1] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [91, 23035.8] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [85, 30768.9] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [35, 34131.5] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [88, 37340.4] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [94, 39941.3] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [93, 40407.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [81, 41869.1] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [45, 7040.61] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [53, 13693.6] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [71, 24216.1] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [54, 36638.2] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [64, 38693.3] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [81, 41280.4] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [93, 41825.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_I8II_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_I8II_BH_GB.yaml new file mode 100644 index 00000000000..17f9986c109 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_I8II_BH_GB.yaml @@ -0,0 +1,26493 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixpoint +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12544 + LdsNumElementsAlignedA: 2304 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2304 + LdsOffsetB_Blk: 10496 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 16 + LSPB: 2 + LVCA: 2 + LVCB: 16 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW8_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 4608 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB1_GRVW16_IU1_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 16 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 16 + LSCB: 128 + LSPA: 32 + LSPB: 4 + LVCA: 1 + LVCB: 8 + LVPA: 2 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 12800 + LdsNumElementsAlignedA: 2560 + LdsNumElementsAlignedB: 2048 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2560 + LdsOffsetB_Blk: 10752 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x16_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 16 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 4 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 8448 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA8_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 79 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 80 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 81 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 82 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 83 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 8 + LSPB: 2 + LVCA: 4 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 84 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 85 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 86 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 87 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 88 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 89 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 9216 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 90 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 91 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 92 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 93 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: false + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 128 + LSPA: 16 + LSPB: 4 + LVCA: 2 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 94 + SolutionNameMin: Cijk_Alik_Bjlk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: true + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 36.4343] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [7, 61.0845] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [84, 105.068] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [39, 168.473] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [39, 233.458] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [56, 295.885] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [84, 327.309] + - - [64, 128, 1, 64, 96, 96, 96, 128] + - [16, 64.251] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [16, 117.95] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [39, 204.6] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [39, 335.303] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [38, 472.811] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [49, 595.254] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [65, 673.054] + - - [64, 256, 1, 64, 96, 96, 96, 256] + - [23, 152.078] + - - [64, 256, 1, 128, 96, 96, 160, 256] + - [39, 278.581] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [38, 457.943] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [66, 713.865] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [48, 993.322] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [85, 1234.03] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [36, 1328.6] + - - [64, 512, 1, 64, 96, 96, 96, 512] + - [18, 332.409] + - - [64, 512, 1, 128, 96, 96, 160, 512] + - [38, 601.852] + - - [64, 512, 1, 256, 96, 96, 288, 512] + - [83, 1013.61] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [36, 1501.45] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [65, 2095.58] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [36, 2525.27] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [36, 2673.82] + - - [64, 1024, 1, 64, 96, 96, 96, 1024] + - [9, 677.377] + - - [64, 1024, 1, 128, 96, 96, 160, 1024] + - [38, 1240.92] + - - [64, 1024, 1, 256, 96, 96, 288, 1024] + - [36, 2085.68] + - - [64, 1024, 1, 512, 96, 96, 544, 1024] + - [83, 3155.39] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [73, 4164.9] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [83, 5128.3] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [73, 5361.96] + - - [64, 2048, 1, 64, 96, 96, 96, 2048] + - [73, 1288.37] + - - [64, 2048, 1, 128, 96, 96, 160, 2048] + - [40, 2315.38] + - - [64, 2048, 1, 256, 96, 96, 288, 2048] + - [73, 3955.96] + - - [64, 2048, 1, 512, 96, 96, 544, 2048] + - [36, 6077.05] + - - [64, 2048, 1, 1024, 96, 96, 1056, 2048] + - [73, 8239.27] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [36, 10096.1] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [65, 11119.3] + - - [64, 4096, 1, 64, 96, 96, 96, 4096] + - [27, 2196.84] + - - [64, 4096, 1, 128, 96, 96, 160, 4096] + - [25, 3963.43] + - - [64, 4096, 1, 256, 96, 96, 288, 4096] + - [9, 6707.53] + - - [64, 4096, 1, 512, 96, 96, 544, 4096] + - [20, 9958.28] + - - [64, 4096, 1, 1024, 96, 96, 1056, 4096] + - [31, 13923.7] + - - [64, 4096, 1, 2048, 96, 96, 2080, 4096] + - [15, 16947.8] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [6, 18577.2] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [45, 65.8986] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [6, 122.084] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [6, 203.924] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [87, 323.71] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [41, 466.786] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [40, 585.368] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [58, 666.477] + - - [128, 128, 1, 64, 160, 160, 96, 128] + - [28, 181.857] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [8, 315.456] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [39, 517.304] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [77, 802.816] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [59, 1046.42] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [76, 1282.07] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [51, 1338.97] + - - [128, 256, 1, 64, 160, 160, 96, 256] + - [39, 392.138] + - - [128, 256, 1, 128, 160, 160, 160, 256] + - [39, 691.901] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [85, 1136.05] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [69, 1683.28] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [88, 2160.48] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [69, 2618.27] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [51, 2759.92] + - - [128, 512, 1, 64, 160, 160, 96, 512] + - [39, 799.376] + - - [128, 512, 1, 128, 160, 160, 160, 512] + - [38, 1419.4] + - - [128, 512, 1, 256, 160, 160, 288, 512] + - [58, 2233.09] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [51, 3434.44] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [51, 4356.59] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [59, 5244.93] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [58, 5382.06] + - - [128, 1024, 1, 64, 160, 160, 96, 1024] + - [39, 1395.54] + - - [128, 1024, 1, 128, 160, 160, 160, 1024] + - [74, 2626.36] + - - [128, 1024, 1, 256, 160, 160, 288, 1024] + - [58, 4353.2] + - - [128, 1024, 1, 512, 160, 160, 544, 1024] + - [41, 6514.17] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [88, 8419.13] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [58, 10179.2] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [59, 11117.0] + - - [128, 2048, 1, 64, 160, 160, 96, 2048] + - [39, 2496.98] + - - [128, 2048, 1, 128, 160, 160, 160, 2048] + - [57, 4718.67] + - - [128, 2048, 1, 256, 160, 160, 288, 2048] + - [39, 7982.5] + - - [128, 2048, 1, 512, 160, 160, 544, 2048] + - [41, 11752.9] + - - [128, 2048, 1, 1024, 160, 160, 1056, 2048] + - [88, 16233.4] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [36, 19544.6] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [59, 21654.6] + - - [128, 4096, 1, 64, 160, 160, 96, 4096] + - [38, 5001.44] + - - [128, 4096, 1, 128, 160, 160, 160, 4096] + - [27, 8921.68] + - - [128, 4096, 1, 256, 160, 160, 288, 4096] + - [79, 15829.4] + - - [128, 4096, 1, 512, 160, 160, 544, 4096] + - [24, 23407.3] + - - [128, 4096, 1, 1024, 160, 160, 1056, 4096] + - [21, 31119.4] + - - [128, 4096, 1, 2048, 160, 160, 2080, 4096] + - [79, 35971.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [5, 37787.9] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [1, 155.115] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [37, 276.305] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [36, 470.69] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [87, 736.618] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [76, 1003.78] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [58, 1221.32] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [41, 1317.31] + - - [256, 128, 1, 64, 288, 288, 96, 128] + - [38, 388.006] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [27, 685.68] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [76, 1130.84] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [51, 1695.7] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [88, 2208.55] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [58, 2580.72] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [87, 2733.01] + - - [256, 256, 1, 64, 288, 288, 96, 256] + - [58, 791.83] + - - [256, 256, 1, 128, 288, 288, 160, 256] + - [6, 1378.57] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [41, 2204.05] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [50, 3314.35] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [50, 4472.43] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [69, 5192.58] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [68, 5359.49] + - - [256, 512, 1, 64, 288, 288, 96, 512] + - [17, 1435.18] + - - [256, 512, 1, 128, 288, 288, 160, 512] + - [67, 2498.84] + - - [256, 512, 1, 256, 288, 288, 288, 512] + - [77, 4343.62] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [50, 6528.73] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [87, 8487.27] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [69, 10229.6] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [40, 11111.4] + - - [256, 1024, 1, 64, 288, 288, 96, 1024] + - [57, 2614.09] + - - [256, 1024, 1, 128, 288, 288, 160, 1024] + - [44, 4717.36] + - - [256, 1024, 1, 256, 288, 288, 288, 1024] + - [58, 7728.76] + - - [256, 1024, 1, 512, 288, 288, 544, 1024] + - [50, 12248.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [69, 16189.3] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [58, 19777.2] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [40, 21984.4] + - - [256, 2048, 1, 64, 288, 288, 96, 2048] + - [60, 5004.41] + - - [256, 2048, 1, 128, 288, 288, 160, 2048] + - [15, 8882.71] + - - [256, 2048, 1, 256, 288, 288, 288, 2048] + - [6, 14686.3] + - - [256, 2048, 1, 512, 288, 288, 544, 2048] + - [21, 23356.5] + - - [256, 2048, 1, 1024, 288, 288, 1056, 2048] + - [78, 30780.4] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [6, 35913.5] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [31, 37662.0] + - - [256, 4096, 1, 64, 288, 288, 96, 4096] + - [50, 9097.04] + - - [256, 4096, 1, 128, 288, 288, 160, 4096] + - [85, 15353.2] + - - [256, 4096, 1, 256, 288, 288, 288, 4096] + - [63, 22876.8] + - - [256, 4096, 1, 512, 288, 288, 544, 4096] + - [81, 30093.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 4096] + - [55, 35581.5] + - - [256, 4096, 1, 2048, 288, 288, 2080, 4096] + - [63, 37636.2] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [93, 40145.2] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [0, 237.557] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [67, 427.525] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [68, 727.59] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [68, 1096.26] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [87, 1486.9] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [76, 1850.63] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [59, 1979.19] + - - [384, 128, 1, 64, 416, 416, 96, 128] + - [41, 594.993] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [39, 1042.32] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [6, 1692.39] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [51, 2483.31] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [88, 3348.08] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [76, 3905.62] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [40, 4033.15] + - - [384, 256, 1, 64, 416, 416, 96, 256] + - [76, 1156.52] + - - [384, 256, 1, 128, 416, 416, 160, 256] + - [39, 2067.52] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [41, 3255.18] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [51, 4899.89] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [50, 6593.95] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [88, 7707.46] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [88, 8029.62] + - - [384, 512, 1, 64, 416, 416, 96, 512] + - [86, 2072.28] + - - [384, 512, 1, 128, 416, 416, 160, 512] + - [85, 3562.05] + - - [384, 512, 1, 256, 416, 416, 288, 512] + - [51, 6016.21] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [88, 9195.51] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [51, 12683.6] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [87, 15155.6] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [88, 16740.3] + - - [384, 1024, 1, 64, 416, 416, 96, 1024] + - [30, 3623.07] + - - [384, 1024, 1, 128, 416, 416, 160, 1024] + - [28, 6477.69] + - - [384, 1024, 1, 256, 416, 416, 288, 1024] + - [90, 11132.9] + - - [384, 1024, 1, 512, 416, 416, 544, 1024] + - [32, 17228.0] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [26, 22142.1] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [34, 26662.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [90, 28382.4] + - - [384, 2048, 1, 64, 416, 416, 96, 2048] + - [77, 8056.93] + - - [384, 2048, 1, 128, 416, 416, 160, 2048] + - [75, 13508.3] + - - [384, 2048, 1, 256, 416, 416, 288, 2048] + - [81, 19806.0] + - - [384, 2048, 1, 512, 416, 416, 544, 2048] + - [81, 28038.0] + - - [384, 2048, 1, 1024, 416, 416, 1056, 2048] + - [81, 33376.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [93, 36319.3] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [93, 38891.5] + - - [384, 4096, 1, 64, 416, 416, 96, 4096] + - [74, 11956.7] + - - [384, 4096, 1, 128, 416, 416, 160, 4096] + - [73, 19519.7] + - - [384, 4096, 1, 256, 416, 416, 288, 4096] + - [85, 26332.7] + - - [384, 4096, 1, 512, 416, 416, 544, 4096] + - [13, 31788.8] + - - [384, 4096, 1, 1024, 416, 416, 1056, 4096] + - [74, 34219.6] + - - [384, 4096, 1, 2048, 416, 416, 2080, 4096] + - [73, 37183.3] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [41, 38906.3] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [3, 518.842] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [36, 858.785] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [36, 1516.56] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [76, 2285.93] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [59, 3034.95] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [40, 3732.01] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [58, 3974.32] + - - [768, 128, 1, 64, 800, 800, 96, 128] + - [76, 1141.62] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [38, 2028.19] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [38, 3342.07] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [50, 4851.24] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [77, 6556.16] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [59, 7680.13] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [41, 7994.71] + - - [768, 256, 1, 64, 800, 800, 96, 256] + - [85, 2034.42] + - - [768, 256, 1, 128, 800, 800, 160, 256] + - [40, 3648.81] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [50, 5934.64] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [50, 9104.88] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [50, 12528.1] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [76, 14913.6] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [76, 16691.3] + - - [768, 512, 1, 64, 800, 800, 96, 512] + - [19, 3772.42] + - - [768, 512, 1, 128, 800, 800, 160, 512] + - [85, 6467.7] + - - [768, 512, 1, 256, 800, 800, 288, 512] + - [20, 10793.8] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [70, 17176.6] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [15, 22504.6] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [31, 27179.7] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [31, 28915.4] + - - [768, 1024, 1, 64, 800, 800, 96, 1024] + - [84, 8033.78] + - - [768, 1024, 1, 128, 800, 800, 160, 1024] + - [29, 13200.0] + - - [768, 1024, 1, 256, 800, 800, 288, 1024] + - [81, 19763.1] + - - [768, 1024, 1, 512, 800, 800, 544, 1024] + - [82, 27977.6] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [93, 33266.1] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [93, 35939.2] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [93, 39032.4] + - - [768, 2048, 1, 64, 800, 800, 96, 2048] + - [73, 12340.7] + - - [768, 2048, 1, 128, 800, 800, 160, 2048] + - [13, 19012.8] + - - [768, 2048, 1, 256, 800, 800, 288, 2048] + - [94, 26329.3] + - - [768, 2048, 1, 512, 800, 800, 544, 2048] + - [81, 32810.8] + - - [768, 2048, 1, 1024, 800, 800, 1056, 2048] + - [93, 35496.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [46, 38748.8] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [94, 40602.6] + - - [768, 4096, 1, 64, 800, 800, 96, 4096] + - [92, 16915.4] + - - [768, 4096, 1, 128, 800, 800, 160, 4096] + - [83, 24720.8] + - - [768, 4096, 1, 256, 800, 800, 288, 4096] + - [63, 31396.0] + - - [768, 4096, 1, 512, 800, 800, 544, 4096] + - [46, 35174.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 4096] + - [63, 39080.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 4096] + - [81, 41196.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [93, 41732.1] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [0, 961.555] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [85, 1732.47] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [87, 2935.82] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [68, 4522.97] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [58, 6027.74] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [58, 7438.1] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [50, 7991.38] + - - [1536, 128, 1, 64, 1568, 1568, 96, 128] + - [41, 1995.39] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [38, 3640.9] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [87, 6077.99] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [50, 9274.33] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [58, 12202.4] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [41, 14979.7] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [41, 16861.9] + - - [1536, 256, 1, 64, 1568, 1568, 96, 256] + - [36, 3744.36] + - - [1536, 256, 1, 128, 1568, 1568, 160, 256] + - [60, 6428.89] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [89, 11314.4] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [32, 16829.1] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [42, 22465.7] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [4, 26640.2] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [52, 28402.9] + - - [1536, 512, 1, 64, 1568, 1568, 96, 512] + - [87, 7699.5] + - - [1536, 512, 1, 128, 1568, 1568, 160, 512] + - [75, 12940.4] + - - [1536, 512, 1, 256, 1568, 1568, 288, 512] + - [72, 20375.1] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [93, 27291.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [63, 33830.8] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 36211.5] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [93, 38960.7] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 1024] + - [83, 11834.4] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 1024] + - [57, 19533.0] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 1024] + - [22, 25987.7] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 1024] + - [81, 33271.7] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [63, 35717.6] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [63, 38785.7] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [94, 40624.1] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 2048] + - [92, 17182.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 2048] + - [75, 25107.8] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 2048] + - [63, 31219.5] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 2048] + - [81, 35398.9] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 2048] + - [81, 39111.6] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [81, 41169.6] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [93, 41751.3] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 4096] + - [91, 22034.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 4096] + - [85, 29172.5] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 4096] + - [93, 33740.7] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 4096] + - [93, 38246.9] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 4096] + - [63, 40816.9] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 4096] + - [94, 41277.3] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [46, 41750.4] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [35, 1578.0] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [4, 2931.37] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [4, 4832.16] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [27, 7409.34] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [11, 10308.1] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [78, 12814.4] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [42, 13927.1] + - - [3072, 128, 1, 64, 3104, 3104, 96, 128] + - [36, 3979.43] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [43, 6292.26] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [31, 11131.6] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [6, 16619.3] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [60, 22457.0] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [4, 26709.1] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [4, 28908.6] + - - [3072, 256, 1, 64, 3104, 3104, 96, 256] + - [58, 7641.06] + - - [3072, 256, 1, 128, 3104, 3104, 160, 256] + - [61, 13427.1] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [63, 20288.9] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [64, 27274.5] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [63, 33859.2] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [47, 36139.2] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [72, 38961.1] + - - [3072, 512, 1, 64, 3104, 3104, 96, 512] + - [65, 11867.9] + - - [3072, 512, 1, 128, 3104, 3104, 160, 512] + - [83, 19514.1] + - - [3072, 512, 1, 256, 3104, 3104, 288, 512] + - [63, 25869.2] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [63, 33237.3] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [63, 35725.5] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [64, 38951.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [72, 40659.2] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 1024] + - [80, 17518.9] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 1024] + - [57, 24552.1] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 1024] + - [81, 31895.9] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 1024] + - [81, 35489.4] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [81, 39129.6] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [81, 41200.3] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [93, 41732.1] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 2048] + - [80, 21727.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 2048] + - [75, 29237.1] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 2048] + - [93, 33840.7] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 2048] + - [81, 38302.4] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 2048] + - [63, 40853.4] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [81, 41506.4] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [63, 41389.9] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 4096] + - [45, 6763.2] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 4096] + - [53, 12835.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 4096] + - [45, 23148.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 4096] + - [92, 35655.0] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 4096] + - [93, 39466.7] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 4096] + - [64, 41295.0] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [81, 41849.2] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [78, 1791.1] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [10, 3576.09] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [14, 5959.42] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [14, 9558.99] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 13081.7] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [12, 16531.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [11, 18058.3] + - - [4096, 128, 1, 64, 4128, 4128, 96, 128] + - [68, 5773.3] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [42, 10035.7] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [12, 16141.6] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [24, 23615.3] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [3, 30533.6] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [42, 35860.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [0, 37730.8] + - - [4096, 256, 1, 64, 4128, 4128, 96, 256] + - [68, 9804.11] + - - [4096, 256, 1, 128, 4128, 4128, 160, 256] + - [62, 15429.1] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [93, 22639.5] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [63, 30681.9] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [81, 35673.7] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [63, 37833.2] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [46, 40145.6] + - - [4096, 512, 1, 64, 4128, 4128, 96, 512] + - [71, 13770.2] + - - [4096, 512, 1, 128, 4128, 4128, 160, 512] + - [75, 21377.4] + - - [4096, 512, 1, 256, 4128, 4128, 288, 512] + - [63, 28915.4] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [81, 35019.8] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [63, 37461.6] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [81, 40218.5] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [93, 41715.8] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 1024] + - [80, 18825.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 1024] + - [75, 27206.8] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 1024] + - [81, 33398.9] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 1024] + - [63, 36738.7] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [81, 39857.6] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [81, 41701.6] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [63, 41326.1] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 2048] + - [91, 23035.8] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 2048] + - [85, 30768.9] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 2048] + - [35, 34131.5] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 2048] + - [88, 37340.4] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 2048] + - [94, 39941.3] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [93, 40407.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [81, 41869.1] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 4096] + - [45, 7040.61] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 4096] + - [53, 13693.6] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 4096] + - [71, 24216.1] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 4096] + - [54, 36638.2] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 4096] + - [64, 38693.3] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 4096] + - [81, 41280.4] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [93, 41825.4] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_SB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_SB.yaml new file mode 100644 index 00000000000..3533e22ee6b --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bjlk_SB.yaml @@ -0,0 +1,310 @@ +- {MinimumRequiredVersion: 4.33.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 32 + LSPA: 32 + LSPB: 8 + LVCA: 8 + LVCB: 32 + LVPA: 32 + LVPB: 8 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: true + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_SB_MT32x32x8_SN_ + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWforTLUandMI: false +- [2, 3, 0, 1] +- - - [126, 126, 2, 66, 126, 126, 66, 126] + - [0, 0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_BBS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_BBS_BH.yaml new file mode 100644 index 00000000000..0f2c3916604 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_BBS_BH.yaml @@ -0,0 +1,9213 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [3, 32.605] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [23, 61.5651] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [11, 107.249] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [12, 161.942] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [23, 232.397] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [2, 296.25] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [11, 333.861] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [12, 60.4855] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [23, 113.765] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [11, 199.482] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [11, 314.321] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [22, 456.548] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [11, 587.335] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [26, 658.123] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [10, 132.765] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [23, 247.188] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [3, 429.524] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [20, 685.848] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [9, 969.333] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [2, 1229.55] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [11, 1331.68] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [12, 297.131] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [20, 547.202] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [20, 939.897] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [29, 1478.43] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [20, 2050.63] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [7, 2550.6] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [9, 2824.33] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [9, 609.637] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [20, 1124.93] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [5, 1866.0] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [20, 2949.32] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [5, 4099.75] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [5, 5132.02] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [2, 5855.53] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [10, 1154.98] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [20, 2129.36] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [12, 3665.55] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [5, 5800.25] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [9, 8203.52] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [5, 10521.5] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [20, 12185.0] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [8, 2075.36] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [4, 3737.82] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [8, 6204.58] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [22, 9318.73] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [7, 12865.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [2, 15730.2] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [8, 16167.9] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [23, 69.1126] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [23, 131.039] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [10, 224.775] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [12, 340.143] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [22, 462.105] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [7, 590.934] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [7, 657.35] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [29, 144.771] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [12, 292.817] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [7, 494.845] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [7, 761.562] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [2, 1041.42] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [22, 1277.92] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [11, 1372.37] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [12, 335.221] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [12, 633.485] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [23, 1061.85] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [22, 1614.75] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [14, 2150.92] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [28, 2624.31] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [2, 2868.7] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [10, 698.236] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [12, 1264.68] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [11, 2119.41] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [2, 3233.85] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [2, 4346.43] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [2, 5289.16] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [28, 5909.29] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [10, 1407.01] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [12, 2551.66] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [17, 4451.37] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [16, 6727.71] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [22, 8930.58] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [2, 10861.7] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [11, 12324.6] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [15, 2711.69] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [3, 4708.73] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [7, 7969.23] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [9, 12252.8] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [2, 16982.1] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [5, 21355.2] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [26, 24447.7] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [18, 4321.24] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [30, 7756.46] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [25, 12868.4] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [2, 19392.8] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [2, 26589.6] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [7, 31724.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [8, 32440.3] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [12, 146.102] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [9, 273.458] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [7, 470.583] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [26, 740.978] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [28, 971.408] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [28, 1224.61] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [2, 1327.1] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [12, 331.512] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [1, 631.102] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [3, 1016.55] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [7, 1560.53] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [22, 2116.6] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [7, 2578.83] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [7, 2809.84] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [12, 698.12] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [6, 1256.34] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [7, 2103.73] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [11, 3194.75] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [2, 4311.25] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [2, 5203.24] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [2, 5754.24] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [23, 1416.03] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [12, 2546.63] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [2, 4239.35] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [22, 6454.02] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [7, 8655.84] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [2, 10428.3] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [28, 11588.0] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [23, 2638.34] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [21, 4975.45] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [22, 8344.8] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [22, 12664.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [7, 16999.3] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [22, 21029.8] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [28, 23312.3] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [26, 4375.9] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [11, 7866.47] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [2, 13016.9] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [11, 19505.5] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [11, 26431.2] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [2, 31527.8] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [28, 33907.8] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [12, 8261.59] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [11, 13575.2] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [3, 20254.7] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [2, 26952.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [11, 32097.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [2, 34026.0] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [8, 34426.4] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [12, 224.984] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [12, 419.766] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [7, 749.784] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [14, 1117.59] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [20, 1487.87] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [28, 1844.93] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [11, 1994.4] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [23, 507.785] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [21, 972.852] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [11, 1618.38] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [11, 2438.79] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [11, 3266.8] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [28, 3916.71] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [2, 4290.85] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [10, 1059.17] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [17, 1905.06] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [29, 3160.34] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [11, 4812.28] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [11, 6441.63] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [2, 7810.93] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [2, 8601.67] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [29, 2043.34] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [29, 3842.7] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [2, 6398.63] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [11, 9694.08] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [0, 12924.6] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [2, 15635.8] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [22, 16866.8] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [22, 3565.57] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [28, 6357.41] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [11, 10421.7] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [0, 15282.1] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [2, 20423.7] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [16, 23986.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [22, 25670.0] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [27, 5651.43] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [21, 10570.5] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [23, 17270.9] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [0, 23414.2] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [22, 28715.8] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [2, 30728.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [22, 33564.9] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [23, 9321.54] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [23, 14657.9] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [3, 20453.8] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [23, 25958.3] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [22, 28841.3] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [11, 31910.1] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [22, 32506.7] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [3, 488.847] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [14, 902.389] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [7, 1538.82] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [9, 2232.6] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [22, 2967.66] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [22, 3708.08] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [9, 3979.81] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [29, 992.97] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [10, 1889.89] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [0, 3151.64] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [22, 4777.57] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [2, 6421.49] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [22, 7729.65] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [2, 8118.82] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [23, 2130.53] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [15, 3825.17] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [2, 6370.29] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [2, 9640.21] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [2, 12865.9] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [11, 15514.7] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [16, 16705.9] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [16, 3582.83] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [22, 6368.68] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [2, 10405.6] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [2, 15381.4] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [2, 20583.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [8, 24014.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [8, 25511.4] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [10, 5691.05] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [10, 10631.9] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [3, 16513.0] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [20, 22687.2] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [9, 27700.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [28, 30531.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [22, 33538.3] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [28, 10093.6] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [16, 16134.5] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [23, 22714.1] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [2, 28840.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [22, 31995.9] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [22, 35301.5] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [22, 37370.5] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [12, 12722.1] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [2, 19122.0] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [23, 25389.5] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [2, 29809.0] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [28, 34232.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [28, 37178.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [28, 36702.3] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [2, 927.67] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [9, 1734.14] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [0, 2950.96] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [9, 4341.56] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [2, 5924.16] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [2, 7453.77] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 8251.09] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [19, 1893.59] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [10, 3619.94] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [22, 6060.4] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [11, 9258.08] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [22, 12492.3] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [11, 15425.5] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [11, 17053.3] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [24, 3418.81] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [22, 6120.84] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [22, 10016.2] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [13, 14862.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 19838.0] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 24074.2] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [4, 25641.8] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [29, 5762.73] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [2, 10113.9] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [23, 17705.3] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [2, 23522.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [11, 28515.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [22, 30715.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 33101.7] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [26, 9838.09] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [28, 15797.8] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [23, 22503.4] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [20, 28510.5] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [22, 32034.1] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [22, 35425.7] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 37301.5] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [23, 12454.5] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [23, 19030.8] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [29, 25477.1] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [20, 30053.8] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [28, 34385.4] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 37109.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [11, 37440.0] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [12, 14424.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [3, 19898.3] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [23, 26513.5] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [22, 32001.3] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [28, 35774.7] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 37333.3] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [14, 37436.3] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [25, 1746.17] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [14, 3297.84] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [13, 4905.14] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [4, 7327.36] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 9869.43] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [4, 11835.8] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [8, 12559.4] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [11, 3175.1] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [8, 6196.19] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [25, 10195.8] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [25, 15087.4] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 20395.8] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [25, 24136.3] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [4, 25545.4] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [13, 5372.15] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [21, 9947.95] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [29, 16228.1] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [10, 21933.4] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [12, 26465.1] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [12, 28809.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [15, 30508.6] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [14, 9767.45] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [11, 15716.3] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [12, 22378.3] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [17, 28223.7] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [23, 31268.6] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [23, 34506.6] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [11, 35980.9] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [28, 12415.3] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [22, 18968.9] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [17, 25147.7] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [17, 29451.5] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [22, 33769.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [11, 36427.6] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 36175.5] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [21, 14349.7] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [17, 19868.9] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [21, 26205.0] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [20, 31647.7] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [28, 35441.9] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [28, 37186.9] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 36606.5] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [23, 15083.2] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [23, 21854.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [23, 27887.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [28, 32940.6] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [28, 35740.6] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 36999.3] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 35682.4] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [25, 1794.16] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [13, 3624.36] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [8, 6047.48] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [18, 9277.51] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [8, 12762.0] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [13, 15591.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 16424.1] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [22, 5059.46] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [11, 9081.04] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [4, 14506.9] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [28, 21203.4] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [20, 27458.6] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [11, 31860.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 32544.0] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [12, 7860.94] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [29, 13104.6] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [28, 19780.1] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [23, 26154.4] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [23, 30932.9] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [22, 33237.6] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [4, 34493.3] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [11, 10777.1] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [9, 17045.7] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [29, 23636.1] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [12, 28937.1] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [12, 32215.5] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [28, 35312.6] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 35733.6] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [27, 13237.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [29, 19938.0] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [23, 24864.9] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [20, 30627.6] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [16, 34561.3] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [28, 37052.7] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 35549.9] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [23, 15046.8] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [23, 20856.2] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [23, 27016.4] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [22, 32297.4] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [28, 35954.1] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [11, 37291.7] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [30, 35673.4] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [23, 15017.8] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [16, 21305.2] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [11, 27487.1] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [28, 33021.3] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [28, 35981.0] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [11, 37429.5] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 35235.8] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_BBS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_BBS_BH_GB.yaml new file mode 100644 index 00000000000..0ad19f83d99 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_BBS_BH_GB.yaml @@ -0,0 +1,9213 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [3, 32.605] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [23, 61.5651] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [11, 107.249] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [12, 161.942] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [23, 232.397] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [2, 296.25] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [11, 333.861] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [12, 60.4855] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [23, 113.765] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [11, 199.482] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [11, 314.321] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [22, 456.548] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [11, 587.335] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [26, 658.123] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [10, 132.765] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [23, 247.188] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [3, 429.524] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [20, 685.848] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [9, 969.333] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [2, 1229.55] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [11, 1331.68] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [12, 297.131] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [20, 547.202] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [20, 939.897] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [29, 1478.43] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [20, 2050.63] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [7, 2550.6] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [9, 2824.33] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [9, 609.637] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [20, 1124.93] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [5, 1866.0] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [20, 2949.32] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [5, 4099.75] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [5, 5132.02] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [2, 5855.53] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [10, 1154.98] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [20, 2129.36] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [12, 3665.55] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [5, 5800.25] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [9, 8203.52] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [5, 10521.5] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [20, 12185.0] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [8, 2075.36] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [4, 3737.82] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [8, 6204.58] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [22, 9318.73] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [7, 12865.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [2, 15730.2] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [8, 16167.9] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [23, 69.1126] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [23, 131.039] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [10, 224.775] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [12, 340.143] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [22, 462.105] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [7, 590.934] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [7, 657.35] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [29, 144.771] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [12, 292.817] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [7, 494.845] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [7, 761.562] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [2, 1041.42] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [22, 1277.92] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [11, 1372.37] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [12, 335.221] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [12, 633.485] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [23, 1061.85] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [22, 1614.75] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [14, 2150.92] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [28, 2624.31] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [2, 2868.7] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [10, 698.236] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [12, 1264.68] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [11, 2119.41] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [2, 3233.85] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [2, 4346.43] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [2, 5289.16] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [28, 5909.29] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [10, 1407.01] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [12, 2551.66] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [17, 4451.37] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [16, 6727.71] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [22, 8930.58] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [2, 10861.7] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [11, 12324.6] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [15, 2711.69] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [3, 4708.73] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [7, 7969.23] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [9, 12252.8] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [2, 16982.1] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [5, 21355.2] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [26, 24447.7] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [18, 4321.24] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [30, 7756.46] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [25, 12868.4] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [2, 19392.8] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [2, 26589.6] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [7, 31724.3] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [8, 32440.3] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [12, 146.102] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [9, 273.458] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [7, 470.583] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [26, 740.978] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [28, 971.408] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [28, 1224.61] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [2, 1327.1] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [12, 331.512] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [1, 631.102] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [3, 1016.55] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [7, 1560.53] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [22, 2116.6] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [7, 2578.83] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [7, 2809.84] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [12, 698.12] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [6, 1256.34] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [7, 2103.73] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [11, 3194.75] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [2, 4311.25] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [2, 5203.24] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [2, 5754.24] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [23, 1416.03] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [12, 2546.63] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [2, 4239.35] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [22, 6454.02] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [7, 8655.84] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [2, 10428.3] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [28, 11588.0] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [23, 2638.34] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [21, 4975.45] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [22, 8344.8] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [22, 12664.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [7, 16999.3] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [22, 21029.8] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [28, 23312.3] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [26, 4375.9] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [11, 7866.47] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [2, 13016.9] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [11, 19505.5] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [11, 26431.2] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [2, 31527.8] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [28, 33907.8] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [12, 8261.59] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [11, 13575.2] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [3, 20254.7] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [2, 26952.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [11, 32097.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [2, 34026.0] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [8, 34426.4] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [12, 224.984] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [12, 419.766] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [7, 749.784] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [14, 1117.59] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [20, 1487.87] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [28, 1844.93] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [11, 1994.4] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [23, 507.785] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [21, 972.852] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [11, 1618.38] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [11, 2438.79] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [11, 3266.8] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [28, 3916.71] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [2, 4290.85] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [10, 1059.17] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [17, 1905.06] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [29, 3160.34] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [11, 4812.28] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [11, 6441.63] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [2, 7810.93] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [2, 8601.67] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [29, 2043.34] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [29, 3842.7] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [2, 6398.63] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [11, 9694.08] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [0, 12924.6] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [2, 15635.8] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [22, 16866.8] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [22, 3565.57] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [28, 6357.41] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [11, 10421.7] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [0, 15282.1] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [2, 20423.7] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [16, 23986.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [22, 25670.0] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [27, 5651.43] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [21, 10570.5] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [23, 17270.9] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [0, 23414.2] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [22, 28715.8] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [2, 30728.1] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [22, 33564.9] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [23, 9321.54] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [23, 14657.9] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [3, 20453.8] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [23, 25958.3] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [22, 28841.3] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [11, 31910.1] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [22, 32506.7] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [3, 488.847] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [14, 902.389] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [7, 1538.82] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [9, 2232.6] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [22, 2967.66] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [22, 3708.08] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [9, 3979.81] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [29, 992.97] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [10, 1889.89] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [0, 3151.64] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [22, 4777.57] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [2, 6421.49] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [22, 7729.65] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [2, 8118.82] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [23, 2130.53] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [15, 3825.17] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [2, 6370.29] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [2, 9640.21] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [2, 12865.9] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [11, 15514.7] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [16, 16705.9] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [16, 3582.83] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [22, 6368.68] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [2, 10405.6] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [2, 15381.4] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [2, 20583.4] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [8, 24014.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [8, 25511.4] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [10, 5691.05] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [10, 10631.9] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [3, 16513.0] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [20, 22687.2] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [9, 27700.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [28, 30531.8] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [22, 33538.3] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [28, 10093.6] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [16, 16134.5] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [23, 22714.1] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [2, 28840.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [22, 31995.9] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [22, 35301.5] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [22, 37370.5] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [12, 12722.1] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [2, 19122.0] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [23, 25389.5] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [2, 29809.0] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [28, 34232.7] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [28, 37178.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [28, 36702.3] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [2, 927.67] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [9, 1734.14] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [0, 2950.96] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [9, 4341.56] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [2, 5924.16] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [2, 7453.77] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 8251.09] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [19, 1893.59] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [10, 3619.94] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [22, 6060.4] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [11, 9258.08] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [22, 12492.3] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [11, 15425.5] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [11, 17053.3] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [24, 3418.81] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [22, 6120.84] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [22, 10016.2] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [13, 14862.4] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 19838.0] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 24074.2] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [4, 25641.8] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [29, 5762.73] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [2, 10113.9] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [23, 17705.3] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [2, 23522.2] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [11, 28515.5] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [22, 30715.2] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 33101.7] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [26, 9838.09] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [28, 15797.8] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [23, 22503.4] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [20, 28510.5] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [22, 32034.1] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [22, 35425.7] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [22, 37301.5] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [23, 12454.5] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [23, 19030.8] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [29, 25477.1] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [20, 30053.8] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [28, 34385.4] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 37109.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [11, 37440.0] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [12, 14424.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [3, 19898.3] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [23, 26513.5] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [22, 32001.3] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [28, 35774.7] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [28, 37333.3] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [14, 37436.3] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [25, 1746.17] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [14, 3297.84] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [13, 4905.14] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [4, 7327.36] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 9869.43] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [4, 11835.8] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [8, 12559.4] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [11, 3175.1] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [8, 6196.19] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [25, 10195.8] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [25, 15087.4] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 20395.8] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [25, 24136.3] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [4, 25545.4] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [13, 5372.15] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [21, 9947.95] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [29, 16228.1] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [10, 21933.4] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [12, 26465.1] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [12, 28809.3] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [15, 30508.6] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [14, 9767.45] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [11, 15716.3] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [12, 22378.3] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [17, 28223.7] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [23, 31268.6] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [23, 34506.6] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [11, 35980.9] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [28, 12415.3] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [22, 18968.9] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [17, 25147.7] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [17, 29451.5] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [22, 33769.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [11, 36427.6] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 36175.5] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [21, 14349.7] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [17, 19868.9] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [21, 26205.0] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [20, 31647.7] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [28, 35441.9] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [28, 37186.9] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 36606.5] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [23, 15083.2] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [23, 21854.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [23, 27887.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [28, 32940.6] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [28, 35740.6] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 36999.3] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 35682.4] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [25, 1794.16] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [13, 3624.36] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [8, 6047.48] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [18, 9277.51] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [8, 12762.0] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [13, 15591.3] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 16424.1] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [22, 5059.46] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [11, 9081.04] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [4, 14506.9] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [28, 21203.4] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [20, 27458.6] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [11, 31860.8] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 32544.0] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [12, 7860.94] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [29, 13104.6] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [28, 19780.1] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [23, 26154.4] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [23, 30932.9] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [22, 33237.6] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [4, 34493.3] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [11, 10777.1] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [9, 17045.7] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [29, 23636.1] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [12, 28937.1] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [12, 32215.5] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [28, 35312.6] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 35733.6] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [27, 13237.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [29, 19938.0] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [23, 24864.9] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [20, 30627.6] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [16, 34561.3] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [28, 37052.7] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [22, 35549.9] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [23, 15046.8] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [23, 20856.2] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [23, 27016.4] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [22, 32297.4] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [28, 35954.1] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [11, 37291.7] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [30, 35673.4] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [23, 15017.8] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [16, 21305.2] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [11, 27487.1] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [28, 33021.3] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [28, 35981.0] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [11, 37429.5] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [25, 35235.8] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HB.yaml new file mode 100644 index 00000000000..c438eb2f918 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HB.yaml @@ -0,0 +1,10833 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [9, 37.8602] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [14, 65.8325] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [25, 122.426] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [25, 179.859] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [14, 254.679] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [2, 319.883] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [7, 353.749] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [26, 70.5447] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [15, 131.615] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [15, 227.31] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [25, 358.457] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [14, 504.974] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [14, 633.581] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [19, 689.315] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [15, 152.898] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [15, 284.36] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [2, 485.509] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [25, 761.355] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [0, 1056.9] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [25, 1311.9] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [33, 1456.67] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [15, 340.835] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [33, 624.992] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [25, 1061.85] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [25, 1640.17] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [0, 2225.98] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [33, 2695.79] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [12, 3035.5] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [33, 669.164] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [8, 1225.33] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [14, 2080.26] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [0, 3221.12] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [25, 4408.1] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [23, 5438.1] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [23, 6197.29] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [21, 1316.07] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [25, 2525.18] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [2, 4270.64] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [0, 6549.14] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [23, 8938.31] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [7, 11133.3] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [12, 12722.7] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [35, 2364.99] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [10, 4212.73] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [21, 6840.18] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [35, 10062.8] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [14, 13495.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [33, 16146.5] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [35, 16494.5] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [29, 79.4617] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [25, 150.399] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [25, 256.847] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [0, 375.162] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [2, 524.715] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [14, 620.782] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [8, 685.793] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [3, 177.274] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [26, 327.016] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [3, 567.567] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [25, 845.117] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [25, 1118.33] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [25, 1346.16] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [19, 1489.49] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [3, 398.547] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [9, 716.608] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [25, 1176.85] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [14, 1754.02] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [25, 2312.99] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [2, 2742.38] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [33, 2995.33] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [34, 810.024] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [3, 1445.81] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [0, 2378.73] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [7, 3539.12] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [19, 4630.11] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [7, 5484.1] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [8, 6080.64] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [26, 1534.7] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [9, 2731.11] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [2, 4536.22] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [8, 6866.76] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [0, 9172.26] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [31, 11117.7] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [31, 12540.2] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [24, 2902.63] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [18, 5224.11] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [14, 8735.89] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [0, 13252.1] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [0, 18147.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [31, 22495.2] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [2, 25333.1] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [35, 4672.02] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [5, 8332.36] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [21, 13577.9] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [2, 20373.1] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [14, 27107.9] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [19, 32195.2] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [21, 33035.2] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [19, 169.098] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [25, 313.008] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [14, 531.395] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [33, 789.221] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [25, 1033.85] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [19, 1289.42] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [25, 1421.59] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [26, 390.749] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [26, 712.953] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [25, 1173.89] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [8, 1742.18] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [19, 2271.95] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [14, 2672.07] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [19, 2958.23] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [20, 772.289] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [9, 1378.12] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [14, 2276.11] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [33, 3401.71] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [25, 4504.56] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [2, 5361.64] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [14, 5954.12] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [26, 1529.37] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [24, 2745.87] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [25, 4544.21] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [25, 6805.48] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [14, 9055.92] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [2, 10734.9] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [8, 12059.7] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [15, 2965.22] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [26, 5321.9] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [25, 8832.47] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [14, 13335.1] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [14, 17805.5] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [7, 21915.8] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [33, 24428.2] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [9, 4716.03] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [20, 8423.35] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [2, 13619.3] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [29, 20285.3] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [2, 27395.6] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [8, 31892.1] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [8, 33930.4] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [22, 8287.09] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [35, 13450.0] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [14, 21338.3] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [14, 27927.2] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [14, 33082.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [25, 34610.6] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [35, 34967.4] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [14, 272.689] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [14, 481.221] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [14, 850.772] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [2, 1233.5] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [0, 1625.38] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [8, 1955.69] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [14, 2147.85] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [20, 563.852] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [15, 1030.38] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [2, 1694.9] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [25, 2533.3] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [31, 3404.47] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [2, 4058.52] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [2, 4466.19] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [9, 1224.02] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [26, 2170.23] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [8, 3576.74] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [0, 5238.51] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [2, 6888.61] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [14, 8029.63] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [8, 8893.91] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [6, 2194.44] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [3, 3955.03] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [0, 6587.07] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [25, 9898.06] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [12, 13166.3] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [14, 15893.8] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [8, 17697.1] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [9, 3731.59] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [8, 6594.82] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [14, 10668.0] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [0, 15673.5] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [21, 20627.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [16, 24685.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [21, 25930.4] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [15, 6765.92] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [20, 12091.7] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [24, 18677.7] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [2, 24898.2] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [25, 29818.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [25, 31668.8] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [33, 33968.5] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [36, 10665.7] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [26, 16547.0] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [24, 22545.0] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [23, 28061.4] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [14, 30385.5] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [2, 32994.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [25, 32801.1] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [14, 536.814] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [14, 984.428] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [14, 1675.93] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [17, 2488.46] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [33, 3245.11] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [33, 3920.83] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [14, 4339.41] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [1, 1183.28] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [26, 2059.4] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [25, 3388.88] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [2, 5063.55] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [14, 6704.19] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [2, 7986.31] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [2, 8856.15] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [11, 2193.29] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [3, 3941.4] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [0, 6520.51] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [14, 9854.46] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [14, 13130.3] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [14, 15814.5] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [2, 17650.2] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [24, 3741.57] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [12, 6590.5] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [25, 10966.7] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [8, 16063.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [10, 20924.7] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [10, 24520.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [24, 26012.5] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [18, 6115.63] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [18, 11615.9] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [13, 17879.8] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [25, 24815.3] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [25, 28872.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [33, 31021.1] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [33, 33823.3] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [24, 11593.2] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [34, 17838.7] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [26, 24790.9] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [2, 30657.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [25, 33062.7] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [25, 35924.4] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [33, 37545.9] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [15, 15421.4] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [13, 22172.6] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [26, 28660.7] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [2, 32062.2] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [33, 35550.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [33, 37830.3] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [33, 37068.2] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [8, 1066.17] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [19, 1967.31] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [19, 3327.94] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [2, 5014.61] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 6433.81] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [2, 7917.83] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 8640.82] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [20, 2230.22] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [34, 4043.35] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [14, 6688.59] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [12, 10009.3] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [14, 13293.3] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [14, 16121.6] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 17756.8] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [32, 3849.77] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [25, 6796.06] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [29, 10933.3] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [16, 16056.1] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [29, 20863.9] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 24714.8] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [5, 25907.0] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [18, 5968.43] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [34, 11440.3] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [24, 17580.1] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [18, 24077.8] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 28836.2] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 30972.8] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 33604.2] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [26, 11567.9] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [26, 17898.9] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [26, 24498.3] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [25, 30336.3] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 32934.2] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 35982.9] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [25, 37712.9] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [24, 15650.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [1, 22274.3] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [34, 28297.1] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [23, 31688.2] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [33, 35575.6] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 37895.5] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [19, 37782.0] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [32, 18261.7] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [34, 25219.4] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [34, 29638.0] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [25, 34177.5] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [33, 37186.6] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [33, 38147.5] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [19, 38089.6] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [27, 1989.71] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [2, 3580.29] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [0, 5465.5] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [16, 7550.5] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [29, 10205.1] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [21, 12080.8] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [29, 12805.6] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [28, 3579.27] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [5, 6428.87] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [16, 10488.0] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [35, 15480.7] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [29, 20956.3] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 24824.5] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [35, 26187.6] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [13, 6271.09] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [26, 11432.5] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [18, 17802.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [25, 22446.9] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [24, 26876.7] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [24, 28742.0] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 30332.9] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [13, 11508.4] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [26, 17822.9] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [34, 24313.4] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [29, 30206.6] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [16, 32411.9] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [14, 35082.0] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [1, 36208.8] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [26, 15303.1] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [26, 22145.7] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [34, 28056.6] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [25, 31658.9] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 35095.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 37139.4] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 36528.9] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [26, 18370.1] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [34, 25143.8] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [26, 29435.9] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [25, 33958.4] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 36889.1] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 37955.9] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [0, 36818.5] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [24, 19499.9] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [26, 26247.8] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [34, 31503.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [33, 35406.7] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 37185.2] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 38019.2] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 37061.7] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [4, 2542.4] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [29, 4318.47] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [29, 7062.62] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [21, 10302.3] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [14, 13399.7] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [21, 15871.1] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 16690.9] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [9, 5849.8] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [25, 10163.4] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [16, 16085.5] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [25, 22880.6] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [16, 28819.1] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [5, 32859.3] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [10, 33157.1] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [26, 9037.04] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [26, 14737.9] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [14, 21304.5] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [25, 27288.4] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 31908.2] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [16, 33568.6] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 34921.6] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [18, 13443.3] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [34, 20121.1] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [20, 25915.8] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [33, 30881.3] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 33258.8] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [25, 35831.5] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [16, 36485.9] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [34, 16655.5] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [26, 23495.5] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [32, 29021.7] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [25, 32596.0] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 35983.6] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [33, 37809.4] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 36173.4] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [32, 19307.1] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [14, 24275.3] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [26, 30362.6] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [33, 34695.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 37399.8] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [19, 38019.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 36454.0] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [30, 18500.7] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [13, 24188.6] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [14, 30466.2] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [25, 35442.2] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 37396.6] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [14, 38168.5] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 36101.8] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HB_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HB_GB.yaml new file mode 100644 index 00000000000..7b1e6525ba7 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HB_GB.yaml @@ -0,0 +1,10833 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 4 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HB_MT128x128x32_MI16x16x16x1_SN_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [9, 37.8602] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [14, 65.8325] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [25, 122.426] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [25, 179.859] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [14, 254.679] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [2, 319.883] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [7, 353.749] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [26, 70.5447] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [15, 131.615] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [15, 227.31] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [25, 358.457] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [14, 504.974] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [14, 633.581] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [19, 689.315] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [15, 152.898] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [15, 284.36] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [2, 485.509] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [25, 761.355] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [0, 1056.9] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [25, 1311.9] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [33, 1456.67] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [15, 340.835] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [33, 624.992] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [25, 1061.85] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [25, 1640.17] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [0, 2225.98] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [33, 2695.79] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [12, 3035.5] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [33, 669.164] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [8, 1225.33] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [14, 2080.26] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [0, 3221.12] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [25, 4408.1] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [23, 5438.1] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [23, 6197.29] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [21, 1316.07] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [25, 2525.18] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [2, 4270.64] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [0, 6549.14] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [23, 8938.31] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [7, 11133.3] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [12, 12722.7] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [35, 2364.99] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [10, 4212.73] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [21, 6840.18] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [35, 10062.8] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [14, 13495.3] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [33, 16146.5] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [35, 16494.5] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [29, 79.4617] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [25, 150.399] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [25, 256.847] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [0, 375.162] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [2, 524.715] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [14, 620.782] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [8, 685.793] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [3, 177.274] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [26, 327.016] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [3, 567.567] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [25, 845.117] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [25, 1118.33] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [25, 1346.16] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [19, 1489.49] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [3, 398.547] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [9, 716.608] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [25, 1176.85] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [14, 1754.02] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [25, 2312.99] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [2, 2742.38] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [33, 2995.33] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [34, 810.024] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [3, 1445.81] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [0, 2378.73] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [7, 3539.12] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [19, 4630.11] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [7, 5484.1] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [8, 6080.64] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [26, 1534.7] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [9, 2731.11] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [2, 4536.22] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [8, 6866.76] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [0, 9172.26] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [31, 11117.7] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [31, 12540.2] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [24, 2902.63] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [18, 5224.11] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [14, 8735.89] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [0, 13252.1] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [0, 18147.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [31, 22495.2] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [2, 25333.1] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [35, 4672.02] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [5, 8332.36] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [21, 13577.9] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [2, 20373.1] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [14, 27107.9] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [19, 32195.2] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [21, 33035.2] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [19, 169.098] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [25, 313.008] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [14, 531.395] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [33, 789.221] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [25, 1033.85] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [19, 1289.42] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [25, 1421.59] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [26, 390.749] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [26, 712.953] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [25, 1173.89] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [8, 1742.18] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [19, 2271.95] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [14, 2672.07] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [19, 2958.23] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [20, 772.289] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [9, 1378.12] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [14, 2276.11] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [33, 3401.71] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [25, 4504.56] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [2, 5361.64] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [14, 5954.12] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [26, 1529.37] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [24, 2745.87] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [25, 4544.21] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [25, 6805.48] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [14, 9055.92] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [2, 10734.9] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [8, 12059.7] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [15, 2965.22] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [26, 5321.9] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [25, 8832.47] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [14, 13335.1] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [14, 17805.5] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [7, 21915.8] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [33, 24428.2] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [9, 4716.03] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [20, 8423.35] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [2, 13619.3] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [29, 20285.3] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [2, 27395.6] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [8, 31892.1] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [8, 33930.4] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [22, 8287.09] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [35, 13450.0] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [14, 21338.3] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [14, 27927.2] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [14, 33082.0] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [25, 34610.6] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [35, 34967.4] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [14, 272.689] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [14, 481.221] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [14, 850.772] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [2, 1233.5] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [0, 1625.38] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [8, 1955.69] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [14, 2147.85] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [20, 563.852] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [15, 1030.38] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [2, 1694.9] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [25, 2533.3] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [31, 3404.47] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [2, 4058.52] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [2, 4466.19] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [9, 1224.02] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [26, 2170.23] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [8, 3576.74] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [0, 5238.51] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [2, 6888.61] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [14, 8029.63] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [8, 8893.91] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [6, 2194.44] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [3, 3955.03] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [0, 6587.07] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [25, 9898.06] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [12, 13166.3] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [14, 15893.8] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [8, 17697.1] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [9, 3731.59] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [8, 6594.82] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [14, 10668.0] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [0, 15673.5] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [21, 20627.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [16, 24685.3] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [21, 25930.4] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [15, 6765.92] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [20, 12091.7] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [24, 18677.7] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [2, 24898.2] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [25, 29818.4] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [25, 31668.8] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [33, 33968.5] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [36, 10665.7] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [26, 16547.0] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [24, 22545.0] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [23, 28061.4] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [14, 30385.5] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [2, 32994.6] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [25, 32801.1] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [14, 536.814] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [14, 984.428] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [14, 1675.93] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [17, 2488.46] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [33, 3245.11] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [33, 3920.83] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [14, 4339.41] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [1, 1183.28] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [26, 2059.4] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [25, 3388.88] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [2, 5063.55] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [14, 6704.19] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [2, 7986.31] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [2, 8856.15] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [11, 2193.29] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [3, 3941.4] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [0, 6520.51] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [14, 9854.46] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [14, 13130.3] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [14, 15814.5] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [2, 17650.2] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [24, 3741.57] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [12, 6590.5] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [25, 10966.7] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [8, 16063.7] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [10, 20924.7] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [10, 24520.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [24, 26012.5] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [18, 6115.63] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [18, 11615.9] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [13, 17879.8] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [25, 24815.3] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [25, 28872.4] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [33, 31021.1] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [33, 33823.3] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [24, 11593.2] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [34, 17838.7] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [26, 24790.9] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [2, 30657.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [25, 33062.7] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [25, 35924.4] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [33, 37545.9] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [15, 15421.4] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [13, 22172.6] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [26, 28660.7] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [2, 32062.2] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [33, 35550.9] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [33, 37830.3] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [33, 37068.2] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [8, 1066.17] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [19, 1967.31] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [19, 3327.94] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [2, 5014.61] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 6433.81] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [2, 7917.83] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [17, 8640.82] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [20, 2230.22] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [34, 4043.35] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [14, 6688.59] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [12, 10009.3] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [14, 13293.3] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [14, 16121.6] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 17756.8] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [32, 3849.77] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [25, 6796.06] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [29, 10933.3] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [16, 16056.1] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [29, 20863.9] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [5, 24714.8] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [5, 25907.0] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [18, 5968.43] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [34, 11440.3] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [24, 17580.1] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [18, 24077.8] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 28836.2] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 30972.8] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [2, 33604.2] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [26, 11567.9] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [26, 17898.9] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [26, 24498.3] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [25, 30336.3] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [25, 32934.2] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 35982.9] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [25, 37712.9] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [24, 15650.4] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [1, 22274.3] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [34, 28297.1] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [23, 31688.2] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [33, 35575.6] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [25, 37895.5] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [19, 37782.0] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [32, 18261.7] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [34, 25219.4] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [34, 29638.0] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [25, 34177.5] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [33, 37186.6] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [33, 38147.5] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [19, 38089.6] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [27, 1989.71] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [2, 3580.29] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [0, 5465.5] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [16, 7550.5] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [29, 10205.1] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [21, 12080.8] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [29, 12805.6] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [28, 3579.27] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [5, 6428.87] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [16, 10488.0] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [35, 15480.7] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [29, 20956.3] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [16, 24824.5] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [35, 26187.6] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [13, 6271.09] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [26, 11432.5] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [18, 17802.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [25, 22446.9] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [24, 26876.7] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [24, 28742.0] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 30332.9] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [13, 11508.4] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [26, 17822.9] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [34, 24313.4] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [29, 30206.6] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [16, 32411.9] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [14, 35082.0] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [1, 36208.8] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [26, 15303.1] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [26, 22145.7] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [34, 28056.6] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [25, 31658.9] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 35095.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 37139.4] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 36528.9] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [26, 18370.1] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [34, 25143.8] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [26, 29435.9] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [25, 33958.4] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 36889.1] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 37955.9] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [0, 36818.5] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [24, 19499.9] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [26, 26247.8] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [34, 31503.5] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [33, 35406.7] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 37185.2] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 38019.2] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [2, 37061.7] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [4, 2542.4] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [29, 4318.47] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [29, 7062.62] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [21, 10302.3] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [14, 13399.7] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [21, 15871.1] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 16690.9] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [9, 5849.8] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [25, 10163.4] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [16, 16085.5] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [25, 22880.6] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [16, 28819.1] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [5, 32859.3] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [10, 33157.1] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [26, 9037.04] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [26, 14737.9] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [14, 21304.5] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [25, 27288.4] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 31908.2] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [16, 33568.6] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [21, 34921.6] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [18, 13443.3] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [34, 20121.1] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [20, 25915.8] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [33, 30881.3] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 33258.8] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [25, 35831.5] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [16, 36485.9] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [34, 16655.5] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [26, 23495.5] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [32, 29021.7] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [25, 32596.0] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 35983.6] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [33, 37809.4] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 36173.4] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [32, 19307.1] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [14, 24275.3] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [26, 30362.6] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [33, 34695.6] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 37399.8] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [19, 38019.9] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 36454.0] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [30, 18500.7] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [13, 24188.6] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [14, 30466.2] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [25, 35442.2] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 37396.6] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [14, 38168.5] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [29, 36101.8] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HHS_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HHS_BH.yaml new file mode 100644 index 00000000000..252d4ed77b0 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HHS_BH.yaml @@ -0,0 +1,15423 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [8, 36.7457] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [20, 68.6421] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [29, 111.931] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [39, 175.083] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [39, 247.116] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [2, 310.505] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [2, 340.267] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [40, 65.5114] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [21, 121.913] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [3, 220.312] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [39, 349.234] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [6, 491.856] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [24, 618.149] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [33, 679.307] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [21, 151.99] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [6, 279.582] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [6, 479.567] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [2, 745.52] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [39, 1027.01] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [39, 1260.26] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [43, 1410.03] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [12, 305.795] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [20, 562.013] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [39, 962.548] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [2, 1496.89] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [2, 2078.96] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [24, 2569.05] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [24, 2891.68] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [20, 622.3] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [48, 1147.71] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [20, 1956.06] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [31, 3043.76] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [51, 4295.79] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [41, 5258.89] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [11, 5969.6] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [20, 1227.66] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [39, 2253.48] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [37, 3857.27] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [19, 6011.17] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [4, 8396.99] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [51, 10595.4] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [39, 12061.8] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [26, 2226.57] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [27, 3942.02] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [9, 6505.95] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [4, 9536.56] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [33, 12933.5] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [33, 15681.0] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [46, 16253.5] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [7, 73.833] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [20, 135.633] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [0, 233.509] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [20, 344.955] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [24, 486.267] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [6, 611.525] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [15, 666.807] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [21, 178.603] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [7, 331.723] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [2, 547.557] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [6, 824.35] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [43, 1096.84] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [51, 1317.2] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [43, 1409.91] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [47, 368.437] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [3, 662.189] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [39, 1096.26] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [43, 1651.95] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [24, 2200.28] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [6, 2639.9] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [6, 2887.08] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [7, 798.611] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [23, 1415.08] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [43, 2341.22] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [20, 3489.44] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [43, 4564.91] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [6, 5405.67] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [33, 5917.36] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [52, 1583.05] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [7, 2834.47] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [29, 4702.13] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [29, 6981.04] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [51, 9150.99] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [20, 10891.2] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [43, 12325.1] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [44, 2882.68] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [40, 5170.97] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [39, 8593.78] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [51, 12941.6] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [20, 17751.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [2, 21555.8] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [15, 24389.9] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [9, 4599.63] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [45, 8503.38] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [18, 13859.7] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [29, 20533.5] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [15, 27450.2] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [6, 32243.5] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [46, 32669.2] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [21, 159.99] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [2, 296.543] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [20, 485.675] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [6, 717.832] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [43, 1018.84] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [24, 1259.41] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [6, 1335.98] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [52, 376.508] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [7, 693.847] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [20, 1141.15] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [6, 1696.21] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [6, 2236.21] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [6, 2657.77] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [43, 2900.5] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [16, 795.427] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [5, 1415.8] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [43, 2329.51] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [2, 3449.25] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [33, 4417.96] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [6, 5296.04] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [6, 5854.77] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [21, 1523.81] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [25, 2709.06] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [39, 4698.83] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [24, 6919.13] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [51, 9062.03] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [24, 10725.4] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [6, 11877.7] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [47, 2951.14] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [25, 5266.74] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [20, 8731.31] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [43, 13153.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [24, 17491.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [15, 21173.3] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [15, 23730.1] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [40, 4922.17] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [42, 8645.79] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [24, 13915.8] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [39, 20629.8] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [0, 27165.4] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [6, 31676.6] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [24, 33904.5] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [17, 8928.77] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [25, 14966.3] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [7, 21681.2] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [6, 28146.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [6, 32918.6] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [43, 34272.5] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [53, 34792.5] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [11, 255.667] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [2, 470.847] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [20, 765.847] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [48, 1123.98] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [48, 1548.09] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [15, 1901.1] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [33, 2091.7] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [49, 547.464] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [40, 1046.31] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [3, 1728.42] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [24, 2559.32] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [51, 3348.52] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [24, 4000.6] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [24, 4340.61] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [21, 1145.78] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [44, 2036.72] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [2, 3352.31] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [43, 5011.6] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [6, 6638.73] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [43, 7912.21] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [43, 8765.9] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [21, 2307.94] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [7, 4084.68] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [0, 6471.84] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [6, 9722.14] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [24, 12959.5] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [24, 15626.1] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [43, 17407.9] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [40, 3699.75] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [53, 6651.44] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [4, 10926.2] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [6, 15931.5] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [6, 20872.6] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [9, 24281.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [43, 25732.7] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [49, 6064.77] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [33, 11356.4] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [37, 18214.6] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [51, 24514.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [43, 29736.9] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [6, 31446.8] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [51, 33687.0] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [52, 10806.6] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [44, 16615.2] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [44, 22539.9] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [44, 27574.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [25, 29728.7] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [50, 32596.5] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [51, 32614.7] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [30, 499.877] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [12, 923.313] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [13, 1497.78] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [24, 2229.43] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [6, 3067.31] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [43, 3766.63] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [19, 4000.92] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [38, 1097.98] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [38, 2014.55] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [6, 3333.22] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [33, 5123.86] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [43, 6740.98] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [6, 7982.49] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [6, 8686.1] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [49, 2159.42] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [7, 3877.02] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [43, 6413.31] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [2, 9668.01] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [43, 12891.5] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [22, 15527.9] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [24, 17518.4] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [21, 3868.69] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [39, 6782.33] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [2, 10931.0] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [15, 15464.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [4, 20500.6] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [18, 23934.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [28, 26178.2] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [21, 6325.44] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [22, 11844.1] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [22, 17949.9] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [24, 24447.6] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [6, 28847.5] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [43, 30748.0] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [51, 33788.1] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [40, 11795.5] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [16, 17724.0] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [44, 24016.0] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [43, 29851.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [43, 32529.0] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [51, 35945.9] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [51, 37562.2] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [21, 15401.3] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [25, 21978.9] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [44, 28088.8] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [24, 31423.5] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [43, 35456.1] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [51, 37685.0] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [51, 36951.0] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [3, 1000.39] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [0, 1845.0] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [6, 2970.46] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [43, 4435.67] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [4, 6060.39] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 7662.28] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 8411.21] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [49, 2077.76] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [3, 3829.23] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [25, 6363.82] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [43, 9573.28] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [6, 12858.5] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 15697.3] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [29, 17128.0] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [40, 3840.94] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [49, 6720.72] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [46, 10829.8] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [27, 15680.9] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [18, 20705.1] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 24112.4] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [14, 25905.7] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [21, 6168.85] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [16, 11549.2] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [33, 18740.3] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [24, 24995.5] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [43, 29613.4] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 31179.0] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [51, 33259.6] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [49, 11812.2] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [44, 18093.5] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [44, 24682.9] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [7, 29987.2] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [43, 32734.0] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [51, 35922.7] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [1, 37829.3] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [40, 15348.5] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [44, 22023.3] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [44, 28212.8] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [51, 31322.6] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [51, 35312.3] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 37702.9] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 37668.8] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [40, 17719.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [44, 24513.9] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [52, 29207.9] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [43, 33728.3] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [51, 36909.3] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [51, 37902.8] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 37593.2] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [11, 1867.46] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [36, 3002.71] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [9, 4961.21] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [53, 7396.27] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [53, 9896.1] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [27, 11820.8] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [28, 13089.7] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [34, 3649.87] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [30, 6573.29] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [9, 10606.2] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [9, 15256.6] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [27, 20513.2] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [36, 24127.5] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [5, 26725.0] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [35, 6018.37] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [40, 11286.4] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [44, 17635.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [25, 22995.6] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 27061.8] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [40, 29256.7] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [30, 30967.9] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [21, 11781.8] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [52, 18002.9] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [25, 24743.6] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [25, 30190.7] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 32462.9] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [10, 35969.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [14, 37823.8] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [40, 14991.9] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [52, 21712.2] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [44, 27782.6] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [25, 31159.7] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 34853.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [14, 37719.5] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [14, 36563.5] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [40, 17801.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [52, 24795.4] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [44, 29186.2] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [25, 33497.5] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 36596.1] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [14, 37875.8] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 36978.7] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [40, 18774.3] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [52, 25527.6] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [52, 30859.6] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [51, 34833.4] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [51, 36900.4] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 37626.6] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [15, 36225.5] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [26, 2086.46] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [46, 3837.85] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [27, 6367.65] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [18, 9467.95] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [36, 12858.6] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [53, 15583.1] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 16485.6] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [52, 5803.26] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [15, 10105.2] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [7, 15846.2] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [15, 22586.0] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [24, 28587.3] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 33047.3] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [36, 33095.7] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [40, 9209.4] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [34, 14999.7] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [34, 21230.3] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [44, 27352.3] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 31965.1] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 34747.8] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [9, 34730.1] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [49, 12811.9] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [52, 19768.4] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [25, 26186.3] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [44, 31344.6] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 33708.2] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [14, 36885.0] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [27, 35926.8] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [40, 16272.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [44, 22970.7] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [32, 28494.8] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [44, 32270.7] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [43, 35694.7] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [14, 38373.3] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [43, 35849.0] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [40, 18733.1] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [44, 24530.9] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [44, 29917.6] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [43, 34241.9] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [51, 37115.0] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [24, 37885.1] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [27, 36307.3] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [38, 17609.0] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [28, 23294.6] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [25, 30255.6] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [43, 34915.7] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [51, 37121.9] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [33, 38048.4] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [46, 35769.5] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HHS_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HHS_BH_GB.yaml new file mode 100644 index 00000000000..29afc33d19f --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_HHS_BH_GB.yaml @@ -0,0 +1,15423 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW4_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS0_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 8 + LVCB: 8 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW4_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25600 + LdsNumElementsAlignedA: 4608 + LdsNumElementsAlignedB: 4608 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4608 + LdsOffsetB_Blk: 20992 + LdsPadA: 8 + LdsPadB: 8 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 4 + DestDataType: 4 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_MT128x128x32_MI16x16x16x1_SN_EPS1_GRVW8_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 2 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 2 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [8, 36.7457] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [20, 68.6421] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [29, 111.931] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [39, 175.083] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [39, 247.116] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [2, 310.505] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [2, 340.267] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [40, 65.5114] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [21, 121.913] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [3, 220.312] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [39, 349.234] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [6, 491.856] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [24, 618.149] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [33, 679.307] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [21, 151.99] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [6, 279.582] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [6, 479.567] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [2, 745.52] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [39, 1027.01] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [39, 1260.26] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [43, 1410.03] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [12, 305.795] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [20, 562.013] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [39, 962.548] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [2, 1496.89] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [2, 2078.96] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [24, 2569.05] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [24, 2891.68] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [20, 622.3] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [48, 1147.71] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [20, 1956.06] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [31, 3043.76] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [51, 4295.79] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [41, 5258.89] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [11, 5969.6] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [20, 1227.66] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [39, 2253.48] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [37, 3857.27] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [19, 6011.17] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [4, 8396.99] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [51, 10595.4] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [39, 12061.8] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [26, 2226.57] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [27, 3942.02] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [9, 6505.95] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [4, 9536.56] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [33, 12933.5] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [33, 15681.0] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [46, 16253.5] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [7, 73.833] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [20, 135.633] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [0, 233.509] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [20, 344.955] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [24, 486.267] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [6, 611.525] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [15, 666.807] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [21, 178.603] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [7, 331.723] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [2, 547.557] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [6, 824.35] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [43, 1096.84] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [51, 1317.2] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [43, 1409.91] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [47, 368.437] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [3, 662.189] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [39, 1096.26] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [43, 1651.95] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [24, 2200.28] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [6, 2639.9] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [6, 2887.08] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [7, 798.611] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [23, 1415.08] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [43, 2341.22] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [20, 3489.44] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [43, 4564.91] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [6, 5405.67] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [33, 5917.36] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [52, 1583.05] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [7, 2834.47] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [29, 4702.13] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [29, 6981.04] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [51, 9150.99] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [20, 10891.2] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [43, 12325.1] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [44, 2882.68] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [40, 5170.97] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [39, 8593.78] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [51, 12941.6] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [20, 17751.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [2, 21555.8] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [15, 24389.9] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [9, 4599.63] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [45, 8503.38] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [18, 13859.7] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [29, 20533.5] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [15, 27450.2] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [6, 32243.5] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [46, 32669.2] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [21, 159.99] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [2, 296.543] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [20, 485.675] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [6, 717.832] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [43, 1018.84] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [24, 1259.41] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [6, 1335.98] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [52, 376.508] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [7, 693.847] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [20, 1141.15] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [6, 1696.21] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [6, 2236.21] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [6, 2657.77] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [43, 2900.5] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [16, 795.427] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [5, 1415.8] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [43, 2329.51] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [2, 3449.25] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [33, 4417.96] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [6, 5296.04] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [6, 5854.77] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [21, 1523.81] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [25, 2709.06] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [39, 4698.83] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [24, 6919.13] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [51, 9062.03] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [24, 10725.4] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [6, 11877.7] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [47, 2951.14] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [25, 5266.74] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [20, 8731.31] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [43, 13153.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [24, 17491.0] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [15, 21173.3] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [15, 23730.1] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [40, 4922.17] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [42, 8645.79] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [24, 13915.8] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [39, 20629.8] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [0, 27165.4] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [6, 31676.6] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [24, 33904.5] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [17, 8928.77] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [25, 14966.3] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [7, 21681.2] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [6, 28146.7] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [6, 32918.6] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [43, 34272.5] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [53, 34792.5] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [11, 255.667] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [2, 470.847] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [20, 765.847] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [48, 1123.98] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [48, 1548.09] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [15, 1901.1] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [33, 2091.7] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [49, 547.464] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [40, 1046.31] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [3, 1728.42] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [24, 2559.32] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [51, 3348.52] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [24, 4000.6] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [24, 4340.61] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [21, 1145.78] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [44, 2036.72] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [2, 3352.31] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [43, 5011.6] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [6, 6638.73] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [43, 7912.21] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [43, 8765.9] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [21, 2307.94] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [7, 4084.68] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [0, 6471.84] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [6, 9722.14] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [24, 12959.5] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [24, 15626.1] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [43, 17407.9] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [40, 3699.75] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [53, 6651.44] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [4, 10926.2] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [6, 15931.5] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [6, 20872.6] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [9, 24281.0] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [43, 25732.7] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [49, 6064.77] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [33, 11356.4] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [37, 18214.6] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [51, 24514.6] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [43, 29736.9] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [6, 31446.8] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [51, 33687.0] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [52, 10806.6] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [44, 16615.2] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [44, 22539.9] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [44, 27574.2] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [25, 29728.7] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [50, 32596.5] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [51, 32614.7] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [30, 499.877] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [12, 923.313] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [13, 1497.78] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [24, 2229.43] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [6, 3067.31] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [43, 3766.63] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [19, 4000.92] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [38, 1097.98] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [38, 2014.55] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [6, 3333.22] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [33, 5123.86] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [43, 6740.98] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [6, 7982.49] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [6, 8686.1] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [49, 2159.42] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [7, 3877.02] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [43, 6413.31] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [2, 9668.01] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [43, 12891.5] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [22, 15527.9] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [24, 17518.4] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [21, 3868.69] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [39, 6782.33] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [2, 10931.0] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [15, 15464.0] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [4, 20500.6] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [18, 23934.6] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [28, 26178.2] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [21, 6325.44] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [22, 11844.1] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [22, 17949.9] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [24, 24447.6] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [6, 28847.5] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [43, 30748.0] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [51, 33788.1] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [40, 11795.5] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [16, 17724.0] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [44, 24016.0] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [43, 29851.6] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [43, 32529.0] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [51, 35945.9] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [51, 37562.2] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [21, 15401.3] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [25, 21978.9] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [44, 28088.8] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [24, 31423.5] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [43, 35456.1] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [51, 37685.0] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [51, 36951.0] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [3, 1000.39] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [0, 1845.0] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [6, 2970.46] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [43, 4435.67] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [4, 6060.39] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [24, 7662.28] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 8411.21] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [49, 2077.76] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [3, 3829.23] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [25, 6363.82] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [43, 9573.28] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [6, 12858.5] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 15697.3] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [29, 17128.0] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [40, 3840.94] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [49, 6720.72] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [46, 10829.8] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [27, 15680.9] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [18, 20705.1] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [27, 24112.4] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [14, 25905.7] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [21, 6168.85] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [16, 11549.2] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [33, 18740.3] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [24, 24995.5] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [43, 29613.4] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 31179.0] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [51, 33259.6] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [49, 11812.2] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [44, 18093.5] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [44, 24682.9] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [7, 29987.2] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [43, 32734.0] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [51, 35922.7] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [1, 37829.3] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [40, 15348.5] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [44, 22023.3] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [44, 28212.8] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [51, 31322.6] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [51, 35312.3] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [43, 37702.9] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 37668.8] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [40, 17719.3] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [44, 24513.9] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [52, 29207.9] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [43, 33728.3] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [51, 36909.3] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [51, 37902.8] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [24, 37593.2] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [11, 1867.46] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [36, 3002.71] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [9, 4961.21] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [53, 7396.27] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [53, 9896.1] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [27, 11820.8] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [28, 13089.7] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [34, 3649.87] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [30, 6573.29] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [9, 10606.2] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [9, 15256.6] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [27, 20513.2] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [36, 24127.5] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [5, 26725.0] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [35, 6018.37] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [40, 11286.4] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [44, 17635.4] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [25, 22995.6] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 27061.8] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [40, 29256.7] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [30, 30967.9] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [21, 11781.8] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [52, 18002.9] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [25, 24743.6] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [25, 30190.7] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [25, 32462.9] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [10, 35969.2] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [14, 37823.8] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [40, 14991.9] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [52, 21712.2] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [44, 27782.6] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [25, 31159.7] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [44, 34853.0] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [14, 37719.5] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [14, 36563.5] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [40, 17801.5] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [52, 24795.4] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [44, 29186.2] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [25, 33497.5] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 36596.1] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [14, 37875.8] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 36978.7] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [40, 18774.3] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [52, 25527.6] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [52, 30859.6] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [51, 34833.4] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [51, 36900.4] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [33, 37626.6] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [15, 36225.5] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [26, 2086.46] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [46, 3837.85] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [27, 6367.65] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [18, 9467.95] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [36, 12858.6] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [53, 15583.1] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [18, 16485.6] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [52, 5803.26] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [15, 10105.2] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [7, 15846.2] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [15, 22586.0] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [24, 28587.3] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 33047.3] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [36, 33095.7] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [40, 9209.4] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [34, 14999.7] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [34, 21230.3] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [44, 27352.3] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 31965.1] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [1, 34747.8] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [9, 34730.1] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [49, 12811.9] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [52, 19768.4] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [25, 26186.3] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [44, 31344.6] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [25, 33708.2] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [14, 36885.0] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [27, 35926.8] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [40, 16272.8] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [44, 22970.7] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [32, 28494.8] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [44, 32270.7] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [43, 35694.7] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [14, 38373.3] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [43, 35849.0] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [40, 18733.1] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [44, 24530.9] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [44, 29917.6] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [43, 34241.9] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [51, 37115.0] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [24, 37885.1] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [27, 36307.3] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [38, 17609.0] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [28, 23294.6] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [25, 30255.6] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [43, 34915.7] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [51, 37121.9] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [33, 38048.4] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [46, 35769.5] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_I8II_BH.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_I8II_BH.yaml new file mode 100644 index 00000000000..c7895241392 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_I8II_BH.yaml @@ -0,0 +1,22173 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 10240 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 38.5506] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [30, 68.5794] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [5, 118.685] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [55, 191.102] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [24, 266.865] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [28, 339.675] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [14, 395.96] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [21, 68.7323] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [55, 130.794] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [21, 228.722] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [55, 380.85] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [37, 535.74] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [30, 685.317] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [37, 796.167] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [68, 161.892] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [7, 302.054] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [66, 523.176] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [30, 818.804] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [4, 1112.33] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [78, 1401.66] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [63, 1624.99] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [68, 345.665] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [29, 642.313] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [66, 1102.17] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [20, 1707.43] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [24, 2277.35] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [48, 2887.53] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [36, 3258.11] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [68, 672.056] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [20, 1313.18] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [27, 2234.88] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [68, 3440.08] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [4, 4682.47] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [13, 5773.56] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [13, 6533.51] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [27, 1302.98] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [54, 2319.22] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [43, 3986.52] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [27, 6463.34] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [6, 8828.95] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [40, 11193.2] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [40, 13014.1] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [13, 2082.83] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [62, 3882.72] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [6, 6471.46] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [76, 10176.5] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [23, 14302.8] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [35, 17831.5] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [13, 19352.3] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [7, 68.5075] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [55, 135.143] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [7, 237.503] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [30, 368.407] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [78, 530.59] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [14, 686.833] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [13, 797.246] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [7, 199.578] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [69, 364.722] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [55, 620.643] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [5, 938.956] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [25, 1223.72] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [63, 1501.72] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [49, 1674.46] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [41, 393.316] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [55, 772.289] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [30, 1279.92] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [30, 1908.45] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [63, 2447.09] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [37, 3029.07] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [36, 3350.0] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [69, 843.246] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [30, 1536.95] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [4, 2548.96] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [4, 3685.68] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [77, 4908.13] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [25, 6048.58] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [24, 6708.37] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [30, 1525.77] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [19, 2786.45] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [76, 4559.65] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [21, 7148.4] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [24, 9529.14] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [76, 11622.1] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [14, 13149.3] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [70, 2594.68] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [66, 4902.79] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [18, 8348.95] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [5, 12993.0] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [67, 17579.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [40, 22628.9] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [40, 26191.4] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [62, 5000.66] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [1, 8895.69] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [24, 14269.4] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [76, 22345.5] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [35, 31809.0] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [36, 37614.4] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [24, 39328.4] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [7, 163.279] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [6, 288.072] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [54, 520.256] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [6, 817.286] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [43, 1104.86] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [25, 1420.35] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [25, 1625.35] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [66, 414.293] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [4, 762.462] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [76, 1241.1] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [6, 1902.18] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [13, 2502.76] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [36, 2975.08] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [13, 3369.51] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [29, 835.19] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [68, 1447.32] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [29, 2425.5] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [68, 3803.49] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [29, 4981.37] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [52, 6034.98] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [48, 6712.74] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [54, 1519.68] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [29, 2781.83] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [40, 4657.1] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [29, 7127.13] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [28, 9430.05] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [29, 11697.6] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [37, 13136.7] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [54, 2661.78] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [43, 4934.48] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [1, 8247.37] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [5, 12712.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [27, 18102.1] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [76, 22394.9] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [68, 25998.0] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [7, 4817.58] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [15, 8664.8] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [76, 14918.1] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [13, 22453.8] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [75, 31701.9] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [13, 37712.2] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [36, 39075.8] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [40, 9556.98] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [15, 16241.3] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [11, 23107.2] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [31, 31195.3] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [63, 36860.4] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [76, 39337.8] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [48, 42022.3] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [6, 251.256] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [6, 446.838] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [68, 796.894] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [20, 1252.28] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [5, 1676.72] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [24, 2141.59] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [25, 2446.61] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [53, 630.28] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [21, 1154.18] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [5, 1919.88] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [69, 2759.72] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [63, 3711.23] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [14, 4546.27] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [14, 5029.4] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [30, 1199.51] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [7, 2184.91] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [55, 3522.66] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [76, 5420.18] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [25, 7347.16] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [67, 8923.26] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [63, 9954.85] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [68, 2011.66] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [55, 3736.57] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [69, 6603.47] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [28, 10171.1] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [19, 13639.1] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [37, 17136.4] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [37, 19473.0] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [42, 3976.92] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [61, 6604.34] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [78, 11686.1] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [63, 18004.6] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [37, 23721.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [37, 28165.5] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [63, 29953.8] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [67, 7821.55] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [64, 13411.1] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [72, 20886.7] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [12, 28102.5] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [60, 35289.5] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [35, 37949.5] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [47, 41000.3] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [50, 11739.2] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [64, 19569.1] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [39, 27020.1] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [54, 33590.9] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [30, 36268.6] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [7, 39724.1] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [7, 41645.4] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [20, 511.584] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [6, 964.947] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [68, 1653.91] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [5, 2522.39] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [76, 3431.16] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [21, 4281.19] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [76, 4884.68] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [52, 1167.9] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [29, 2157.94] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [76, 3510.37] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [20, 5463.11] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [40, 7291.27] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [78, 8823.54] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [62, 9937.89] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [54, 2087.41] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [27, 3835.68] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [66, 6526.41] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [52, 10080.4] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [66, 13536.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [28, 17018.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [36, 19391.0] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [0, 3637.21] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [75, 7153.45] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [62, 11398.9] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [24, 17105.1] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [36, 23418.3] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [37, 27902.0] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [62, 29851.1] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [52, 6690.37] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [64, 13391.4] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [71, 20878.0] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [10, 27938.8] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [37, 35078.9] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [36, 37988.9] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [62, 40917.0] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [73, 12139.8] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [51, 20062.4] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [33, 27213.7] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [54, 33451.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [35, 36754.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [60, 40713.7] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [12, 43086.7] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [73, 17071.7] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [26, 25695.8] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [58, 32861.6] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [32, 36276.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [9, 40014.3] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [62, 42772.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [75, 43035.5] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [3, 893.549] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [68, 1823.35] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [66, 3156.78] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [4, 4894.17] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [66, 6763.21] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [13, 8453.07] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [13, 9663.84] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [68, 2033.77] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [4, 3649.35] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [66, 6246.97] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [52, 10036.2] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [4, 13651.1] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [48, 16991.8] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [52, 19534.0] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [76, 4064.25] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [48, 6538.3] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [48, 11985.2] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [24, 17247.2] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [48, 23529.1] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [62, 28109.4] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [36, 29912.6] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [27, 8165.42] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [65, 13421.8] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [71, 20221.7] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [36, 28599.6] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [62, 35306.6] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [60, 38042.7] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [36, 40905.5] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [45, 11680.6] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [64, 19427.4] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [73, 27391.4] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [74, 33371.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [60, 36429.4] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [35, 40697.3] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [12, 43090.2] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [45, 16626.2] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [51, 25877.5] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [74, 32844.2] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [57, 36321.7] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [44, 40125.9] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [62, 42773.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [75, 43092.5] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [73, 21607.4] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [45, 30644.5] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [34, 34998.2] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [32, 39209.6] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [36, 42262.3] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [9, 42637.6] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [29, 43208.6] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [6, 1803.23] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [62, 2871.5] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [36, 5087.1] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [13, 7863.11] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [62, 10897.3] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [62, 13628.0] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [36, 14815.7] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [38, 4018.17] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [36, 6185.55] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [14, 10363.8] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [48, 17710.0] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [62, 23113.1] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [25, 27916.5] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 29818.4] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [34, 7449.92] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [64, 13829.3] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [31, 20772.5] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [37, 27921.3] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [13, 34935.9] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [75, 37956.7] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [62, 40886.8] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [73, 11637.4] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [50, 19974.9] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [34, 27012.8] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [34, 33600.7] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [35, 36905.2] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [61, 40723.0] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [47, 42817.9] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [45, 16557.8] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [45, 25375.2] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [34, 33008.5] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [32, 36453.3] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [8, 40048.1] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [36, 42814.4] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [75, 43281.0] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [73, 22138.4] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [73, 30708.8] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [46, 35121.0] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [44, 39435.7] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [36, 42320.8] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [69, 42714.1] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [27, 42866.9] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [19, 6510.9] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [16, 13148.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [22, 24930.8] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [11, 39133.0] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 41732.5] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [34, 42575.9] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [29, 43103.9] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [24, 1910.63] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [76, 3657.97] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [13, 6304.29] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [24, 9697.13] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [24, 13918.0] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [36, 17590.2] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [62, 19365.6] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [4, 5850.82] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [2, 10205.2] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [36, 16625.5] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [13, 24310.4] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [62, 31786.4] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [13, 37607.9] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [36, 39160.5] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [54, 9831.36] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [17, 16168.9] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [33, 23303.7] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [56, 31410.7] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [60, 36991.2] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [37, 39511.5] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 41894.4] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [73, 13413.7] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [64, 22455.7] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [10, 29743.5] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [32, 36082.6] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [71, 38464.0] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [62, 41781.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 43408.5] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [73, 18740.3] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [73, 27851.8] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [74, 34575.6] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [44, 37737.4] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [36, 41071.9] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [36, 43425.9] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [63, 43288.9] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [58, 22742.0] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [33, 31786.3] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [58, 36015.3] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [59, 39381.7] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 41619.2] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [27, 41878.8] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [32, 43314.6] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [19, 6780.39] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [16, 13826.8] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [11, 26586.0] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [11, 39749.7] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 41006.2] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [31, 42968.4] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [63, 43313.6] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_I8II_BH_GB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_I8II_BH_GB.yaml new file mode 100644 index 00000000000..bc371234fa6 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_I8II_BH_GB.yaml @@ -0,0 +1,22173 @@ +- {MinimumRequiredVersion: 4.35.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA8_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 9 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 10 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 11 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 12 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 13 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 14 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 15 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 16 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 17 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 18 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 19 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 20 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 21 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 22 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 23 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 24 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 25 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM1 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 1 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 26 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 27 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 28 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 29 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 30 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 31 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 32 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 33 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 34 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 35 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 36 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 37 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 38 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 39 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 40 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 41 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 42 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 43 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 44 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 45 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 46 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 47 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 48 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 49 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM4 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 4 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 50 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 51 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 52 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 53 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 54 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 55 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 56 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 57 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 58 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 59 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 60 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 61 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 62 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 63 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU0_SUS0_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 0 + StaggerUMapping: 0 + StaggerUStride: 0 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 0 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 64 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 65 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 66 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 67 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 68 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 69 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA1_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 1 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: 1 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 2 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 10240 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 1 + LoopTail: true + LoopUnroll: 16 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 2 + ScheduleLocalWrite: 1 + SolutionIndex: 70 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB1_GRVW16_IU2_LPA32_LPB32_PLR1_SIA2_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 71 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 72 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 73 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 0 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 74 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR0_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 8 + GlobalLoadVectorWidthB: 8 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 8 + LSPB: 8 + LVCA: 4 + LVCB: 4 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 4 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 4 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 75 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW8_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 64 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 76 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW2_VW2_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 2 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 2 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 25856 + LdsNumElementsAlignedA: 4352 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 4352 + LdsOffsetB_Blk: 20736 + LdsPadA: 8 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 77 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA8_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false + - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 32 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: true + ExpandPointerSwap: true + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 16 + GlobalLoadVectorWidthB: 16 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 16 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 2 + LVCB: 2 + LVPA: 1 + LVPB: 1 + LdcEqualsLdd: false + LdsBlockSizePerPad: 128 + LdsBlockSizePerPadA: 128 + LdsBlockSizePerPadB: 128 + LdsInitCVgprs: false + LdsNumElements: 26624 + LdsNumElementsAlignedA: 5120 + LdsNumElementsAlignedB: 5120 + LdsOffsetA: 0 + LdsOffsetA_Blk: 16384 + LdsOffsetB: 5120 + LdsOffsetB_Blk: 21504 + LdsPadA: 32 + LdsPadB: 32 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 16 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 2 + LoopTail: true + LoopUnroll: 32 + MACInstruction: FMA + MFMA_BF16_1K: false + MIArchVgpr: true + MIBlock: [16, 16, 16, 1, 1, 1] + MIInputPerThread: 16 + MIOutputVectorWidth: 1 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [4, 4] + MIWaveTileA: 4 + MIWaveTileB: 4 + MacroTile0: 128 + MacroTile1: 128 + MacroTileA: 128 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 16 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 16, 1] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 128 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: 1 + PrefetchLocalRead: 1 + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 6 + ConvolutionConfig: [] + DataType: 8 + DestDataType: 6 + Fp16AltImpl: false + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 3 + ScheduleLocalWrite: 1 + SolutionIndex: 78 + SolutionNameMin: Cijk_Alik_Bljk_I8II_BH_MT128x128x32_MI16x16x16x1_SN_1LDSB0_GRVW16_IU1_LPA32_LPB32_PLR1_SIA3_SU32_SUS256_SVW4_VW4_WGM8 + SourceSwap: 1 + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + ThreadSeparateGlobalReadA: 0 + ThreadSeparateGlobalReadB: 0 + ThreadTile: [4, 64] + ThreadTile0: 32 + ThreadTile1: 4 + ThreadTileA: 32 + ThreadTileB: 4 + TransposeLDS: 1 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 4 + WaveSeparateGlobalReadA: 1 + WaveSeparateGlobalReadB: 1 + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 32 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWBforTLUandMI: false +- [2, 3, 0, 1] +- - - [64, 64, 1, 64, 96, 96, 96, 96] + - [2, 38.5506] + - - [64, 64, 1, 128, 96, 96, 160, 160] + - [30, 68.5794] + - - [64, 64, 1, 256, 96, 96, 288, 288] + - [5, 118.685] + - - [64, 64, 1, 512, 96, 96, 544, 544] + - [55, 191.102] + - - [64, 64, 1, 1024, 96, 96, 1056, 1056] + - [24, 266.865] + - - [64, 64, 1, 2048, 96, 96, 2080, 2080] + - [28, 339.675] + - - [64, 64, 1, 4096, 96, 96, 4128, 4128] + - [14, 395.96] + - - [64, 128, 1, 64, 96, 96, 96, 96] + - [21, 68.7323] + - - [64, 128, 1, 128, 96, 96, 160, 160] + - [55, 130.794] + - - [64, 128, 1, 256, 96, 96, 288, 288] + - [21, 228.722] + - - [64, 128, 1, 512, 96, 96, 544, 544] + - [55, 380.85] + - - [64, 128, 1, 1024, 96, 96, 1056, 1056] + - [37, 535.74] + - - [64, 128, 1, 2048, 96, 96, 2080, 2080] + - [30, 685.317] + - - [64, 128, 1, 4096, 96, 96, 4128, 4128] + - [37, 796.167] + - - [64, 256, 1, 64, 96, 96, 96, 96] + - [68, 161.892] + - - [64, 256, 1, 128, 96, 96, 160, 160] + - [7, 302.054] + - - [64, 256, 1, 256, 96, 96, 288, 288] + - [66, 523.176] + - - [64, 256, 1, 512, 96, 96, 544, 544] + - [30, 818.804] + - - [64, 256, 1, 1024, 96, 96, 1056, 1056] + - [4, 1112.33] + - - [64, 256, 1, 2048, 96, 96, 2080, 2080] + - [78, 1401.66] + - - [64, 256, 1, 4096, 96, 96, 4128, 4128] + - [63, 1624.99] + - - [64, 512, 1, 64, 96, 96, 96, 96] + - [68, 345.665] + - - [64, 512, 1, 128, 96, 96, 160, 160] + - [29, 642.313] + - - [64, 512, 1, 256, 96, 96, 288, 288] + - [66, 1102.17] + - - [64, 512, 1, 512, 96, 96, 544, 544] + - [20, 1707.43] + - - [64, 512, 1, 1024, 96, 96, 1056, 1056] + - [24, 2277.35] + - - [64, 512, 1, 2048, 96, 96, 2080, 2080] + - [48, 2887.53] + - - [64, 512, 1, 4096, 96, 96, 4128, 4128] + - [36, 3258.11] + - - [64, 1024, 1, 64, 96, 96, 96, 96] + - [68, 672.056] + - - [64, 1024, 1, 128, 96, 96, 160, 160] + - [20, 1313.18] + - - [64, 1024, 1, 256, 96, 96, 288, 288] + - [27, 2234.88] + - - [64, 1024, 1, 512, 96, 96, 544, 544] + - [68, 3440.08] + - - [64, 1024, 1, 1024, 96, 96, 1056, 1056] + - [4, 4682.47] + - - [64, 1024, 1, 2048, 96, 96, 2080, 2080] + - [13, 5773.56] + - - [64, 1024, 1, 4096, 96, 96, 4128, 4128] + - [13, 6533.51] + - - [64, 2048, 1, 64, 96, 96, 96, 96] + - [27, 1302.98] + - - [64, 2048, 1, 128, 96, 96, 160, 160] + - [54, 2319.22] + - - [64, 2048, 1, 256, 96, 96, 288, 288] + - [43, 3986.52] + - - [64, 2048, 1, 512, 96, 96, 544, 544] + - [27, 6463.34] + - - [64, 2048, 1, 1024, 96, 96, 1056, 1056] + - [6, 8828.95] + - - [64, 2048, 1, 2048, 96, 96, 2080, 2080] + - [40, 11193.2] + - - [64, 2048, 1, 4096, 96, 96, 4128, 4128] + - [40, 13014.1] + - - [64, 4096, 1, 64, 96, 96, 96, 96] + - [13, 2082.83] + - - [64, 4096, 1, 128, 96, 96, 160, 160] + - [62, 3882.72] + - - [64, 4096, 1, 256, 96, 96, 288, 288] + - [6, 6471.46] + - - [64, 4096, 1, 512, 96, 96, 544, 544] + - [76, 10176.5] + - - [64, 4096, 1, 1024, 96, 96, 1056, 1056] + - [23, 14302.8] + - - [64, 4096, 1, 2048, 96, 96, 2080, 2080] + - [35, 17831.5] + - - [64, 4096, 1, 4096, 96, 96, 4128, 4128] + - [13, 19352.3] + - - [128, 64, 1, 64, 160, 160, 96, 96] + - [7, 68.5075] + - - [128, 64, 1, 128, 160, 160, 160, 160] + - [55, 135.143] + - - [128, 64, 1, 256, 160, 160, 288, 288] + - [7, 237.503] + - - [128, 64, 1, 512, 160, 160, 544, 544] + - [30, 368.407] + - - [128, 64, 1, 1024, 160, 160, 1056, 1056] + - [78, 530.59] + - - [128, 64, 1, 2048, 160, 160, 2080, 2080] + - [14, 686.833] + - - [128, 64, 1, 4096, 160, 160, 4128, 4128] + - [13, 797.246] + - - [128, 128, 1, 64, 160, 160, 96, 96] + - [7, 199.578] + - - [128, 128, 1, 128, 160, 160, 160, 160] + - [69, 364.722] + - - [128, 128, 1, 256, 160, 160, 288, 288] + - [55, 620.643] + - - [128, 128, 1, 512, 160, 160, 544, 544] + - [5, 938.956] + - - [128, 128, 1, 1024, 160, 160, 1056, 1056] + - [25, 1223.72] + - - [128, 128, 1, 2048, 160, 160, 2080, 2080] + - [63, 1501.72] + - - [128, 128, 1, 4096, 160, 160, 4128, 4128] + - [49, 1674.46] + - - [128, 256, 1, 64, 160, 160, 96, 96] + - [41, 393.316] + - - [128, 256, 1, 128, 160, 160, 160, 160] + - [55, 772.289] + - - [128, 256, 1, 256, 160, 160, 288, 288] + - [30, 1279.92] + - - [128, 256, 1, 512, 160, 160, 544, 544] + - [30, 1908.45] + - - [128, 256, 1, 1024, 160, 160, 1056, 1056] + - [63, 2447.09] + - - [128, 256, 1, 2048, 160, 160, 2080, 2080] + - [37, 3029.07] + - - [128, 256, 1, 4096, 160, 160, 4128, 4128] + - [36, 3350.0] + - - [128, 512, 1, 64, 160, 160, 96, 96] + - [69, 843.246] + - - [128, 512, 1, 128, 160, 160, 160, 160] + - [30, 1536.95] + - - [128, 512, 1, 256, 160, 160, 288, 288] + - [4, 2548.96] + - - [128, 512, 1, 512, 160, 160, 544, 544] + - [4, 3685.68] + - - [128, 512, 1, 1024, 160, 160, 1056, 1056] + - [77, 4908.13] + - - [128, 512, 1, 2048, 160, 160, 2080, 2080] + - [25, 6048.58] + - - [128, 512, 1, 4096, 160, 160, 4128, 4128] + - [24, 6708.37] + - - [128, 1024, 1, 64, 160, 160, 96, 96] + - [30, 1525.77] + - - [128, 1024, 1, 128, 160, 160, 160, 160] + - [19, 2786.45] + - - [128, 1024, 1, 256, 160, 160, 288, 288] + - [76, 4559.65] + - - [128, 1024, 1, 512, 160, 160, 544, 544] + - [21, 7148.4] + - - [128, 1024, 1, 1024, 160, 160, 1056, 1056] + - [24, 9529.14] + - - [128, 1024, 1, 2048, 160, 160, 2080, 2080] + - [76, 11622.1] + - - [128, 1024, 1, 4096, 160, 160, 4128, 4128] + - [14, 13149.3] + - - [128, 2048, 1, 64, 160, 160, 96, 96] + - [70, 2594.68] + - - [128, 2048, 1, 128, 160, 160, 160, 160] + - [66, 4902.79] + - - [128, 2048, 1, 256, 160, 160, 288, 288] + - [18, 8348.95] + - - [128, 2048, 1, 512, 160, 160, 544, 544] + - [5, 12993.0] + - - [128, 2048, 1, 1024, 160, 160, 1056, 1056] + - [67, 17579.3] + - - [128, 2048, 1, 2048, 160, 160, 2080, 2080] + - [40, 22628.9] + - - [128, 2048, 1, 4096, 160, 160, 4128, 4128] + - [40, 26191.4] + - - [128, 4096, 1, 64, 160, 160, 96, 96] + - [62, 5000.66] + - - [128, 4096, 1, 128, 160, 160, 160, 160] + - [1, 8895.69] + - - [128, 4096, 1, 256, 160, 160, 288, 288] + - [24, 14269.4] + - - [128, 4096, 1, 512, 160, 160, 544, 544] + - [76, 22345.5] + - - [128, 4096, 1, 1024, 160, 160, 1056, 1056] + - [35, 31809.0] + - - [128, 4096, 1, 2048, 160, 160, 2080, 2080] + - [36, 37614.4] + - - [128, 4096, 1, 4096, 160, 160, 4128, 4128] + - [24, 39328.4] + - - [256, 64, 1, 64, 288, 288, 96, 96] + - [7, 163.279] + - - [256, 64, 1, 128, 288, 288, 160, 160] + - [6, 288.072] + - - [256, 64, 1, 256, 288, 288, 288, 288] + - [54, 520.256] + - - [256, 64, 1, 512, 288, 288, 544, 544] + - [6, 817.286] + - - [256, 64, 1, 1024, 288, 288, 1056, 1056] + - [43, 1104.86] + - - [256, 64, 1, 2048, 288, 288, 2080, 2080] + - [25, 1420.35] + - - [256, 64, 1, 4096, 288, 288, 4128, 4128] + - [25, 1625.35] + - - [256, 128, 1, 64, 288, 288, 96, 96] + - [66, 414.293] + - - [256, 128, 1, 128, 288, 288, 160, 160] + - [4, 762.462] + - - [256, 128, 1, 256, 288, 288, 288, 288] + - [76, 1241.1] + - - [256, 128, 1, 512, 288, 288, 544, 544] + - [6, 1902.18] + - - [256, 128, 1, 1024, 288, 288, 1056, 1056] + - [13, 2502.76] + - - [256, 128, 1, 2048, 288, 288, 2080, 2080] + - [36, 2975.08] + - - [256, 128, 1, 4096, 288, 288, 4128, 4128] + - [13, 3369.51] + - - [256, 256, 1, 64, 288, 288, 96, 96] + - [29, 835.19] + - - [256, 256, 1, 128, 288, 288, 160, 160] + - [68, 1447.32] + - - [256, 256, 1, 256, 288, 288, 288, 288] + - [29, 2425.5] + - - [256, 256, 1, 512, 288, 288, 544, 544] + - [68, 3803.49] + - - [256, 256, 1, 1024, 288, 288, 1056, 1056] + - [29, 4981.37] + - - [256, 256, 1, 2048, 288, 288, 2080, 2080] + - [52, 6034.98] + - - [256, 256, 1, 4096, 288, 288, 4128, 4128] + - [48, 6712.74] + - - [256, 512, 1, 64, 288, 288, 96, 96] + - [54, 1519.68] + - - [256, 512, 1, 128, 288, 288, 160, 160] + - [29, 2781.83] + - - [256, 512, 1, 256, 288, 288, 288, 288] + - [40, 4657.1] + - - [256, 512, 1, 512, 288, 288, 544, 544] + - [29, 7127.13] + - - [256, 512, 1, 1024, 288, 288, 1056, 1056] + - [28, 9430.05] + - - [256, 512, 1, 2048, 288, 288, 2080, 2080] + - [29, 11697.6] + - - [256, 512, 1, 4096, 288, 288, 4128, 4128] + - [37, 13136.7] + - - [256, 1024, 1, 64, 288, 288, 96, 96] + - [54, 2661.78] + - - [256, 1024, 1, 128, 288, 288, 160, 160] + - [43, 4934.48] + - - [256, 1024, 1, 256, 288, 288, 288, 288] + - [1, 8247.37] + - - [256, 1024, 1, 512, 288, 288, 544, 544] + - [5, 12712.4] + - - [256, 1024, 1, 1024, 288, 288, 1056, 1056] + - [27, 18102.1] + - - [256, 1024, 1, 2048, 288, 288, 2080, 2080] + - [76, 22394.9] + - - [256, 1024, 1, 4096, 288, 288, 4128, 4128] + - [68, 25998.0] + - - [256, 2048, 1, 64, 288, 288, 96, 96] + - [7, 4817.58] + - - [256, 2048, 1, 128, 288, 288, 160, 160] + - [15, 8664.8] + - - [256, 2048, 1, 256, 288, 288, 288, 288] + - [76, 14918.1] + - - [256, 2048, 1, 512, 288, 288, 544, 544] + - [13, 22453.8] + - - [256, 2048, 1, 1024, 288, 288, 1056, 1056] + - [75, 31701.9] + - - [256, 2048, 1, 2048, 288, 288, 2080, 2080] + - [13, 37712.2] + - - [256, 2048, 1, 4096, 288, 288, 4128, 4128] + - [36, 39075.8] + - - [256, 4096, 1, 64, 288, 288, 96, 96] + - [40, 9556.98] + - - [256, 4096, 1, 128, 288, 288, 160, 160] + - [15, 16241.3] + - - [256, 4096, 1, 256, 288, 288, 288, 288] + - [11, 23107.2] + - - [256, 4096, 1, 512, 288, 288, 544, 544] + - [31, 31195.3] + - - [256, 4096, 1, 1024, 288, 288, 1056, 1056] + - [63, 36860.4] + - - [256, 4096, 1, 2048, 288, 288, 2080, 2080] + - [76, 39337.8] + - - [256, 4096, 1, 4096, 288, 288, 4128, 4128] + - [48, 42022.3] + - - [384, 64, 1, 64, 416, 416, 96, 96] + - [6, 251.256] + - - [384, 64, 1, 128, 416, 416, 160, 160] + - [6, 446.838] + - - [384, 64, 1, 256, 416, 416, 288, 288] + - [68, 796.894] + - - [384, 64, 1, 512, 416, 416, 544, 544] + - [20, 1252.28] + - - [384, 64, 1, 1024, 416, 416, 1056, 1056] + - [5, 1676.72] + - - [384, 64, 1, 2048, 416, 416, 2080, 2080] + - [24, 2141.59] + - - [384, 64, 1, 4096, 416, 416, 4128, 4128] + - [25, 2446.61] + - - [384, 128, 1, 64, 416, 416, 96, 96] + - [53, 630.28] + - - [384, 128, 1, 128, 416, 416, 160, 160] + - [21, 1154.18] + - - [384, 128, 1, 256, 416, 416, 288, 288] + - [5, 1919.88] + - - [384, 128, 1, 512, 416, 416, 544, 544] + - [69, 2759.72] + - - [384, 128, 1, 1024, 416, 416, 1056, 1056] + - [63, 3711.23] + - - [384, 128, 1, 2048, 416, 416, 2080, 2080] + - [14, 4546.27] + - - [384, 128, 1, 4096, 416, 416, 4128, 4128] + - [14, 5029.4] + - - [384, 256, 1, 64, 416, 416, 96, 96] + - [30, 1199.51] + - - [384, 256, 1, 128, 416, 416, 160, 160] + - [7, 2184.91] + - - [384, 256, 1, 256, 416, 416, 288, 288] + - [55, 3522.66] + - - [384, 256, 1, 512, 416, 416, 544, 544] + - [76, 5420.18] + - - [384, 256, 1, 1024, 416, 416, 1056, 1056] + - [25, 7347.16] + - - [384, 256, 1, 2048, 416, 416, 2080, 2080] + - [67, 8923.26] + - - [384, 256, 1, 4096, 416, 416, 4128, 4128] + - [63, 9954.85] + - - [384, 512, 1, 64, 416, 416, 96, 96] + - [68, 2011.66] + - - [384, 512, 1, 128, 416, 416, 160, 160] + - [55, 3736.57] + - - [384, 512, 1, 256, 416, 416, 288, 288] + - [69, 6603.47] + - - [384, 512, 1, 512, 416, 416, 544, 544] + - [28, 10171.1] + - - [384, 512, 1, 1024, 416, 416, 1056, 1056] + - [19, 13639.1] + - - [384, 512, 1, 2048, 416, 416, 2080, 2080] + - [37, 17136.4] + - - [384, 512, 1, 4096, 416, 416, 4128, 4128] + - [37, 19473.0] + - - [384, 1024, 1, 64, 416, 416, 96, 96] + - [42, 3976.92] + - - [384, 1024, 1, 128, 416, 416, 160, 160] + - [61, 6604.34] + - - [384, 1024, 1, 256, 416, 416, 288, 288] + - [78, 11686.1] + - - [384, 1024, 1, 512, 416, 416, 544, 544] + - [63, 18004.6] + - - [384, 1024, 1, 1024, 416, 416, 1056, 1056] + - [37, 23721.8] + - - [384, 1024, 1, 2048, 416, 416, 2080, 2080] + - [37, 28165.5] + - - [384, 1024, 1, 4096, 416, 416, 4128, 4128] + - [63, 29953.8] + - - [384, 2048, 1, 64, 416, 416, 96, 96] + - [67, 7821.55] + - - [384, 2048, 1, 128, 416, 416, 160, 160] + - [64, 13411.1] + - - [384, 2048, 1, 256, 416, 416, 288, 288] + - [72, 20886.7] + - - [384, 2048, 1, 512, 416, 416, 544, 544] + - [12, 28102.5] + - - [384, 2048, 1, 1024, 416, 416, 1056, 1056] + - [60, 35289.5] + - - [384, 2048, 1, 2048, 416, 416, 2080, 2080] + - [35, 37949.5] + - - [384, 2048, 1, 4096, 416, 416, 4128, 4128] + - [47, 41000.3] + - - [384, 4096, 1, 64, 416, 416, 96, 96] + - [50, 11739.2] + - - [384, 4096, 1, 128, 416, 416, 160, 160] + - [64, 19569.1] + - - [384, 4096, 1, 256, 416, 416, 288, 288] + - [39, 27020.1] + - - [384, 4096, 1, 512, 416, 416, 544, 544] + - [54, 33590.9] + - - [384, 4096, 1, 1024, 416, 416, 1056, 1056] + - [30, 36268.6] + - - [384, 4096, 1, 2048, 416, 416, 2080, 2080] + - [7, 39724.1] + - - [384, 4096, 1, 4096, 416, 416, 4128, 4128] + - [7, 41645.4] + - - [768, 64, 1, 64, 800, 800, 96, 96] + - [20, 511.584] + - - [768, 64, 1, 128, 800, 800, 160, 160] + - [6, 964.947] + - - [768, 64, 1, 256, 800, 800, 288, 288] + - [68, 1653.91] + - - [768, 64, 1, 512, 800, 800, 544, 544] + - [5, 2522.39] + - - [768, 64, 1, 1024, 800, 800, 1056, 1056] + - [76, 3431.16] + - - [768, 64, 1, 2048, 800, 800, 2080, 2080] + - [21, 4281.19] + - - [768, 64, 1, 4096, 800, 800, 4128, 4128] + - [76, 4884.68] + - - [768, 128, 1, 64, 800, 800, 96, 96] + - [52, 1167.9] + - - [768, 128, 1, 128, 800, 800, 160, 160] + - [29, 2157.94] + - - [768, 128, 1, 256, 800, 800, 288, 288] + - [76, 3510.37] + - - [768, 128, 1, 512, 800, 800, 544, 544] + - [20, 5463.11] + - - [768, 128, 1, 1024, 800, 800, 1056, 1056] + - [40, 7291.27] + - - [768, 128, 1, 2048, 800, 800, 2080, 2080] + - [78, 8823.54] + - - [768, 128, 1, 4096, 800, 800, 4128, 4128] + - [62, 9937.89] + - - [768, 256, 1, 64, 800, 800, 96, 96] + - [54, 2087.41] + - - [768, 256, 1, 128, 800, 800, 160, 160] + - [27, 3835.68] + - - [768, 256, 1, 256, 800, 800, 288, 288] + - [66, 6526.41] + - - [768, 256, 1, 512, 800, 800, 544, 544] + - [52, 10080.4] + - - [768, 256, 1, 1024, 800, 800, 1056, 1056] + - [66, 13536.4] + - - [768, 256, 1, 2048, 800, 800, 2080, 2080] + - [28, 17018.3] + - - [768, 256, 1, 4096, 800, 800, 4128, 4128] + - [36, 19391.0] + - - [768, 512, 1, 64, 800, 800, 96, 96] + - [0, 3637.21] + - - [768, 512, 1, 128, 800, 800, 160, 160] + - [75, 7153.45] + - - [768, 512, 1, 256, 800, 800, 288, 288] + - [62, 11398.9] + - - [768, 512, 1, 512, 800, 800, 544, 544] + - [24, 17105.1] + - - [768, 512, 1, 1024, 800, 800, 1056, 1056] + - [36, 23418.3] + - - [768, 512, 1, 2048, 800, 800, 2080, 2080] + - [37, 27902.0] + - - [768, 512, 1, 4096, 800, 800, 4128, 4128] + - [62, 29851.1] + - - [768, 1024, 1, 64, 800, 800, 96, 96] + - [52, 6690.37] + - - [768, 1024, 1, 128, 800, 800, 160, 160] + - [64, 13391.4] + - - [768, 1024, 1, 256, 800, 800, 288, 288] + - [71, 20878.0] + - - [768, 1024, 1, 512, 800, 800, 544, 544] + - [10, 27938.8] + - - [768, 1024, 1, 1024, 800, 800, 1056, 1056] + - [37, 35078.9] + - - [768, 1024, 1, 2048, 800, 800, 2080, 2080] + - [36, 37988.9] + - - [768, 1024, 1, 4096, 800, 800, 4128, 4128] + - [62, 40917.0] + - - [768, 2048, 1, 64, 800, 800, 96, 96] + - [73, 12139.8] + - - [768, 2048, 1, 128, 800, 800, 160, 160] + - [51, 20062.4] + - - [768, 2048, 1, 256, 800, 800, 288, 288] + - [33, 27213.7] + - - [768, 2048, 1, 512, 800, 800, 544, 544] + - [54, 33451.3] + - - [768, 2048, 1, 1024, 800, 800, 1056, 1056] + - [35, 36754.4] + - - [768, 2048, 1, 2048, 800, 800, 2080, 2080] + - [60, 40713.7] + - - [768, 2048, 1, 4096, 800, 800, 4128, 4128] + - [12, 43086.7] + - - [768, 4096, 1, 64, 800, 800, 96, 96] + - [73, 17071.7] + - - [768, 4096, 1, 128, 800, 800, 160, 160] + - [26, 25695.8] + - - [768, 4096, 1, 256, 800, 800, 288, 288] + - [58, 32861.6] + - - [768, 4096, 1, 512, 800, 800, 544, 544] + - [32, 36276.7] + - - [768, 4096, 1, 1024, 800, 800, 1056, 1056] + - [9, 40014.3] + - - [768, 4096, 1, 2048, 800, 800, 2080, 2080] + - [62, 42772.1] + - - [768, 4096, 1, 4096, 800, 800, 4128, 4128] + - [75, 43035.5] + - - [1536, 64, 1, 64, 1568, 1568, 96, 96] + - [3, 893.549] + - - [1536, 64, 1, 128, 1568, 1568, 160, 160] + - [68, 1823.35] + - - [1536, 64, 1, 256, 1568, 1568, 288, 288] + - [66, 3156.78] + - - [1536, 64, 1, 512, 1568, 1568, 544, 544] + - [4, 4894.17] + - - [1536, 64, 1, 1024, 1568, 1568, 1056, 1056] + - [66, 6763.21] + - - [1536, 64, 1, 2048, 1568, 1568, 2080, 2080] + - [13, 8453.07] + - - [1536, 64, 1, 4096, 1568, 1568, 4128, 4128] + - [13, 9663.84] + - - [1536, 128, 1, 64, 1568, 1568, 96, 96] + - [68, 2033.77] + - - [1536, 128, 1, 128, 1568, 1568, 160, 160] + - [4, 3649.35] + - - [1536, 128, 1, 256, 1568, 1568, 288, 288] + - [66, 6246.97] + - - [1536, 128, 1, 512, 1568, 1568, 544, 544] + - [52, 10036.2] + - - [1536, 128, 1, 1024, 1568, 1568, 1056, 1056] + - [4, 13651.1] + - - [1536, 128, 1, 2048, 1568, 1568, 2080, 2080] + - [48, 16991.8] + - - [1536, 128, 1, 4096, 1568, 1568, 4128, 4128] + - [52, 19534.0] + - - [1536, 256, 1, 64, 1568, 1568, 96, 96] + - [76, 4064.25] + - - [1536, 256, 1, 128, 1568, 1568, 160, 160] + - [48, 6538.3] + - - [1536, 256, 1, 256, 1568, 1568, 288, 288] + - [48, 11985.2] + - - [1536, 256, 1, 512, 1568, 1568, 544, 544] + - [24, 17247.2] + - - [1536, 256, 1, 1024, 1568, 1568, 1056, 1056] + - [48, 23529.1] + - - [1536, 256, 1, 2048, 1568, 1568, 2080, 2080] + - [62, 28109.4] + - - [1536, 256, 1, 4096, 1568, 1568, 4128, 4128] + - [36, 29912.6] + - - [1536, 512, 1, 64, 1568, 1568, 96, 96] + - [27, 8165.42] + - - [1536, 512, 1, 128, 1568, 1568, 160, 160] + - [65, 13421.8] + - - [1536, 512, 1, 256, 1568, 1568, 288, 288] + - [71, 20221.7] + - - [1536, 512, 1, 512, 1568, 1568, 544, 544] + - [36, 28599.6] + - - [1536, 512, 1, 1024, 1568, 1568, 1056, 1056] + - [62, 35306.6] + - - [1536, 512, 1, 2048, 1568, 1568, 2080, 2080] + - [60, 38042.7] + - - [1536, 512, 1, 4096, 1568, 1568, 4128, 4128] + - [36, 40905.5] + - - [1536, 1024, 1, 64, 1568, 1568, 96, 96] + - [45, 11680.6] + - - [1536, 1024, 1, 128, 1568, 1568, 160, 160] + - [64, 19427.4] + - - [1536, 1024, 1, 256, 1568, 1568, 288, 288] + - [73, 27391.4] + - - [1536, 1024, 1, 512, 1568, 1568, 544, 544] + - [74, 33371.0] + - - [1536, 1024, 1, 1024, 1568, 1568, 1056, 1056] + - [60, 36429.4] + - - [1536, 1024, 1, 2048, 1568, 1568, 2080, 2080] + - [35, 40697.3] + - - [1536, 1024, 1, 4096, 1568, 1568, 4128, 4128] + - [12, 43090.2] + - - [1536, 2048, 1, 64, 1568, 1568, 96, 96] + - [45, 16626.2] + - - [1536, 2048, 1, 128, 1568, 1568, 160, 160] + - [51, 25877.5] + - - [1536, 2048, 1, 256, 1568, 1568, 288, 288] + - [74, 32844.2] + - - [1536, 2048, 1, 512, 1568, 1568, 544, 544] + - [57, 36321.7] + - - [1536, 2048, 1, 1024, 1568, 1568, 1056, 1056] + - [44, 40125.9] + - - [1536, 2048, 1, 2048, 1568, 1568, 2080, 2080] + - [62, 42773.2] + - - [1536, 2048, 1, 4096, 1568, 1568, 4128, 4128] + - [75, 43092.5] + - - [1536, 4096, 1, 64, 1568, 1568, 96, 96] + - [73, 21607.4] + - - [1536, 4096, 1, 128, 1568, 1568, 160, 160] + - [45, 30644.5] + - - [1536, 4096, 1, 256, 1568, 1568, 288, 288] + - [34, 34998.2] + - - [1536, 4096, 1, 512, 1568, 1568, 544, 544] + - [32, 39209.6] + - - [1536, 4096, 1, 1024, 1568, 1568, 1056, 1056] + - [36, 42262.3] + - - [1536, 4096, 1, 2048, 1568, 1568, 2080, 2080] + - [9, 42637.6] + - - [1536, 4096, 1, 4096, 1568, 1568, 4128, 4128] + - [29, 43208.6] + - - [3072, 64, 1, 64, 3104, 3104, 96, 96] + - [6, 1803.23] + - - [3072, 64, 1, 128, 3104, 3104, 160, 160] + - [62, 2871.5] + - - [3072, 64, 1, 256, 3104, 3104, 288, 288] + - [36, 5087.1] + - - [3072, 64, 1, 512, 3104, 3104, 544, 544] + - [13, 7863.11] + - - [3072, 64, 1, 1024, 3104, 3104, 1056, 1056] + - [62, 10897.3] + - - [3072, 64, 1, 2048, 3104, 3104, 2080, 2080] + - [62, 13628.0] + - - [3072, 64, 1, 4096, 3104, 3104, 4128, 4128] + - [36, 14815.7] + - - [3072, 128, 1, 64, 3104, 3104, 96, 96] + - [38, 4018.17] + - - [3072, 128, 1, 128, 3104, 3104, 160, 160] + - [36, 6185.55] + - - [3072, 128, 1, 256, 3104, 3104, 288, 288] + - [14, 10363.8] + - - [3072, 128, 1, 512, 3104, 3104, 544, 544] + - [48, 17710.0] + - - [3072, 128, 1, 1024, 3104, 3104, 1056, 1056] + - [62, 23113.1] + - - [3072, 128, 1, 2048, 3104, 3104, 2080, 2080] + - [25, 27916.5] + - - [3072, 128, 1, 4096, 3104, 3104, 4128, 4128] + - [13, 29818.4] + - - [3072, 256, 1, 64, 3104, 3104, 96, 96] + - [34, 7449.92] + - - [3072, 256, 1, 128, 3104, 3104, 160, 160] + - [64, 13829.3] + - - [3072, 256, 1, 256, 3104, 3104, 288, 288] + - [31, 20772.5] + - - [3072, 256, 1, 512, 3104, 3104, 544, 544] + - [37, 27921.3] + - - [3072, 256, 1, 1024, 3104, 3104, 1056, 1056] + - [13, 34935.9] + - - [3072, 256, 1, 2048, 3104, 3104, 2080, 2080] + - [75, 37956.7] + - - [3072, 256, 1, 4096, 3104, 3104, 4128, 4128] + - [62, 40886.8] + - - [3072, 512, 1, 64, 3104, 3104, 96, 96] + - [73, 11637.4] + - - [3072, 512, 1, 128, 3104, 3104, 160, 160] + - [50, 19974.9] + - - [3072, 512, 1, 256, 3104, 3104, 288, 288] + - [34, 27012.8] + - - [3072, 512, 1, 512, 3104, 3104, 544, 544] + - [34, 33600.7] + - - [3072, 512, 1, 1024, 3104, 3104, 1056, 1056] + - [35, 36905.2] + - - [3072, 512, 1, 2048, 3104, 3104, 2080, 2080] + - [61, 40723.0] + - - [3072, 512, 1, 4096, 3104, 3104, 4128, 4128] + - [47, 42817.9] + - - [3072, 1024, 1, 64, 3104, 3104, 96, 96] + - [45, 16557.8] + - - [3072, 1024, 1, 128, 3104, 3104, 160, 160] + - [45, 25375.2] + - - [3072, 1024, 1, 256, 3104, 3104, 288, 288] + - [34, 33008.5] + - - [3072, 1024, 1, 512, 3104, 3104, 544, 544] + - [32, 36453.3] + - - [3072, 1024, 1, 1024, 3104, 3104, 1056, 1056] + - [8, 40048.1] + - - [3072, 1024, 1, 2048, 3104, 3104, 2080, 2080] + - [36, 42814.4] + - - [3072, 1024, 1, 4096, 3104, 3104, 4128, 4128] + - [75, 43281.0] + - - [3072, 2048, 1, 64, 3104, 3104, 96, 96] + - [73, 22138.4] + - - [3072, 2048, 1, 128, 3104, 3104, 160, 160] + - [73, 30708.8] + - - [3072, 2048, 1, 256, 3104, 3104, 288, 288] + - [46, 35121.0] + - - [3072, 2048, 1, 512, 3104, 3104, 544, 544] + - [44, 39435.7] + - - [3072, 2048, 1, 1024, 3104, 3104, 1056, 1056] + - [36, 42320.8] + - - [3072, 2048, 1, 2048, 3104, 3104, 2080, 2080] + - [69, 42714.1] + - - [3072, 2048, 1, 4096, 3104, 3104, 4128, 4128] + - [27, 42866.9] + - - [3072, 4096, 1, 64, 3104, 3104, 96, 96] + - [19, 6510.9] + - - [3072, 4096, 1, 128, 3104, 3104, 160, 160] + - [16, 13148.5] + - - [3072, 4096, 1, 256, 3104, 3104, 288, 288] + - [22, 24930.8] + - - [3072, 4096, 1, 512, 3104, 3104, 544, 544] + - [11, 39133.0] + - - [3072, 4096, 1, 1024, 3104, 3104, 1056, 1056] + - [33, 41732.5] + - - [3072, 4096, 1, 2048, 3104, 3104, 2080, 2080] + - [34, 42575.9] + - - [3072, 4096, 1, 4096, 3104, 3104, 4128, 4128] + - [29, 43103.9] + - - [4096, 64, 1, 64, 4128, 4128, 96, 96] + - [24, 1910.63] + - - [4096, 64, 1, 128, 4128, 4128, 160, 160] + - [76, 3657.97] + - - [4096, 64, 1, 256, 4128, 4128, 288, 288] + - [13, 6304.29] + - - [4096, 64, 1, 512, 4128, 4128, 544, 544] + - [24, 9697.13] + - - [4096, 64, 1, 1024, 4128, 4128, 1056, 1056] + - [24, 13918.0] + - - [4096, 64, 1, 2048, 4128, 4128, 2080, 2080] + - [36, 17590.2] + - - [4096, 64, 1, 4096, 4128, 4128, 4128, 4128] + - [62, 19365.6] + - - [4096, 128, 1, 64, 4128, 4128, 96, 96] + - [4, 5850.82] + - - [4096, 128, 1, 128, 4128, 4128, 160, 160] + - [2, 10205.2] + - - [4096, 128, 1, 256, 4128, 4128, 288, 288] + - [36, 16625.5] + - - [4096, 128, 1, 512, 4128, 4128, 544, 544] + - [13, 24310.4] + - - [4096, 128, 1, 1024, 4128, 4128, 1056, 1056] + - [62, 31786.4] + - - [4096, 128, 1, 2048, 4128, 4128, 2080, 2080] + - [13, 37607.9] + - - [4096, 128, 1, 4096, 4128, 4128, 4128, 4128] + - [36, 39160.5] + - - [4096, 256, 1, 64, 4128, 4128, 96, 96] + - [54, 9831.36] + - - [4096, 256, 1, 128, 4128, 4128, 160, 160] + - [17, 16168.9] + - - [4096, 256, 1, 256, 4128, 4128, 288, 288] + - [33, 23303.7] + - - [4096, 256, 1, 512, 4128, 4128, 544, 544] + - [56, 31410.7] + - - [4096, 256, 1, 1024, 4128, 4128, 1056, 1056] + - [60, 36991.2] + - - [4096, 256, 1, 2048, 4128, 4128, 2080, 2080] + - [37, 39511.5] + - - [4096, 256, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 41894.4] + - - [4096, 512, 1, 64, 4128, 4128, 96, 96] + - [73, 13413.7] + - - [4096, 512, 1, 128, 4128, 4128, 160, 160] + - [64, 22455.7] + - - [4096, 512, 1, 256, 4128, 4128, 288, 288] + - [10, 29743.5] + - - [4096, 512, 1, 512, 4128, 4128, 544, 544] + - [32, 36082.6] + - - [4096, 512, 1, 1024, 4128, 4128, 1056, 1056] + - [71, 38464.0] + - - [4096, 512, 1, 2048, 4128, 4128, 2080, 2080] + - [62, 41781.9] + - - [4096, 512, 1, 4096, 4128, 4128, 4128, 4128] + - [76, 43408.5] + - - [4096, 1024, 1, 64, 4128, 4128, 96, 96] + - [73, 18740.3] + - - [4096, 1024, 1, 128, 4128, 4128, 160, 160] + - [73, 27851.8] + - - [4096, 1024, 1, 256, 4128, 4128, 288, 288] + - [74, 34575.6] + - - [4096, 1024, 1, 512, 4128, 4128, 544, 544] + - [44, 37737.4] + - - [4096, 1024, 1, 1024, 4128, 4128, 1056, 1056] + - [36, 41071.9] + - - [4096, 1024, 1, 2048, 4128, 4128, 2080, 2080] + - [36, 43425.9] + - - [4096, 1024, 1, 4096, 4128, 4128, 4128, 4128] + - [63, 43288.9] + - - [4096, 2048, 1, 64, 4128, 4128, 96, 96] + - [58, 22742.0] + - - [4096, 2048, 1, 128, 4128, 4128, 160, 160] + - [33, 31786.3] + - - [4096, 2048, 1, 256, 4128, 4128, 288, 288] + - [58, 36015.3] + - - [4096, 2048, 1, 512, 4128, 4128, 544, 544] + - [59, 39381.7] + - - [4096, 2048, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 41619.2] + - - [4096, 2048, 1, 2048, 4128, 4128, 2080, 2080] + - [27, 41878.8] + - - [4096, 2048, 1, 4096, 4128, 4128, 4128, 4128] + - [32, 43314.6] + - - [4096, 4096, 1, 64, 4128, 4128, 96, 96] + - [19, 6780.39] + - - [4096, 4096, 1, 128, 4128, 4128, 160, 160] + - [16, 13826.8] + - - [4096, 4096, 1, 256, 4128, 4128, 288, 288] + - [11, 26586.0] + - - [4096, 4096, 1, 512, 4128, 4128, 544, 544] + - [11, 39749.7] + - - [4096, 4096, 1, 1024, 4128, 4128, 1056, 1056] + - [33, 41006.2] + - - [4096, 4096, 1, 2048, 4128, 4128, 2080, 2080] + - [31, 42968.4] + - - [4096, 4096, 1, 4096, 4128, 4128, 4128, 4128] + - [63, 43313.6] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_SB.yaml b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_SB.yaml new file mode 100644 index 00000000000..f279f00d092 --- /dev/null +++ b/projects/rocblas/library/src/blas3/Tensile/Logic/asm_full/strixhalo/strixhalo_Cijk_Alik_Bljk_SB.yaml @@ -0,0 +1,310 @@ +- {MinimumRequiredVersion: 4.33.0} +- strixhalo +- gfx1151 +- [Device 1586] +- AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] +- - 1LDSBuffer: 0 + AggressivePerfMode: 1 + AssertAlphaValue: false + AssertBetaValue: false + AssertCEqualsD: false + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSizeEqual: {} + AssertSizeGreaterThan: {} + AssertSizeLessThan: {} + AssertSizeMultiple: {} + AssertStrideAEqual: {0: 1} + AssertStrideBEqual: {0: 1} + AssertStrideCEqual: {0: 1} + AssertStrideDEqual: {0: 1} + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + AtomicAddC: false + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + CodeObjectVersion: default + CustomKernelName: '' + DepthU: 8 + DepthULdsDivisor: 1 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + DisableVgprOverlapping: false + EdgeType: ShiftPtr + EnableMatrixInstruction: false + ExpandPointerSwap: 0 + Fp16AltImpl: false + FractionalLoad: 0 + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: SingleBuffer + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [11, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 8 + LSCB: 8 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 32 + LVPB: 32 + LdcEqualsLdd: false + LdsBlockSizePerPad: 0 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsInitCVgprs: false + LdsNumElements: 512 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalReadVectorWidth: 1 + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopIters: 8 + LoopTail: true + LoopUnroll: 8 + MACInstruction: FMA + MIArchVgpr: false + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MagicDivAlg: 2 + MatrixInstruction: [] + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NoLdsWriteCode: false + NoReject: false + NoTailLoop: false + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 4 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 256 + OptNoLoadLoop: 1 + OptPreLoopVmcnt: 0 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackSummationDims: 0 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PersistentKernelAlongBatch: false + PrefetchAcrossPersistent: 0 + PrefetchAcrossPersistentMode: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 0 + DestDataType: 0 + Fp16AltImpl: false + HighPrecisionAccumulate: false + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MirrorDimsA: [] + MirrorDimsB: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + StridedBatched: true + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStridesAB: false + UseInitialStridesCD: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_SB_MT32x32x8_SN_ + SourceSwap: false + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StoreCInUnroll: false + StoreCInUnrollExact: false + StoreCInUnrollInterval: 1 + StoreCInUnrollPostLoop: false + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSyncOpt: 0 + StoreVectorWidth: 4 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + TransposeLDS: 0 + UnrollIncIsDepthU: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + UnrollMemFence: false + Use64bShadowLimit: 1 + UseInstOffsetForGRO: 0 + UseSgprForGRO: -1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: -1 + VectorWidth: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WavefrontSize: 64 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _DepthULds: 8 + _GlobalAccumulation: null + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemC: 0 + _staggerStrideShift: 3 + allowLRVWforTLUandMI: false +- [2, 3, 0, 1] +- - - [126, 126, 2, 66, 126, 126, 66, 66] + - [0, 0] +- null +- null +- DeviceEfficiency diff --git a/projects/rocblas/tensile_tag.txt b/projects/rocblas/tensile_tag.txt index ca12e6a69b2..6ad0ad31125 100644 --- a/projects/rocblas/tensile_tag.txt +++ b/projects/rocblas/tensile_tag.txt @@ -1 +1 @@ -235405f247b95d0a03134aea29c200fc9b2c4cea +f591f88e48f3e7735d550afbc5970de6a8eb0bd4